Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
23.90% |
60 / 251 |
|
37.04% |
10 / 27 |
CRAP | |
0.00% |
0 / 1 |
SerializerState | |
23.90% |
60 / 251 |
|
37.04% |
10 / 27 |
3987.45 | |
0.00% |
0 / 1 |
solWikitextRegexp | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
6 | |||
solRegexp | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
6 | |||
__construct | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
1 | |||
getEnv | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
initMode | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
appendSep | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
updateSep | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
resetSep | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
resetCurrLine | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
flushLine | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
getOrigSrc | |
80.00% |
4 / 5 |
|
0.00% |
0 / 1 |
3.07 | |||
isValidDSR | |
0.00% |
0 / 45 |
|
0.00% |
0 / 1 |
342 | |||
updateModificationFlags | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
sepIntroducedSOL | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
pushToCurrLine | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
emitSep | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
emitSepForNode | |
0.00% |
0 / 30 |
|
0.00% |
0 / 1 |
272 | |||
recoverTrimmedWhitespace | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
emitChunk | |
40.48% |
17 / 42 |
|
0.00% |
0 / 1 |
156.81 | |||
serializeChildren | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
5 | |||
kickOffSerialize | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 | |||
serializeChildrenToString | |
0.00% |
0 / 33 |
|
0.00% |
0 / 1 |
2 | |||
serializeLinkChildrenToString | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
serializeCaptionChildrenToString | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
serializeIndentPreChildrenToString | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
openAnnotationRange | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
closeAnnotationRange | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Html2Wt; |
5 | |
6 | use Composer\Semver\Semver; |
7 | use stdClass; |
8 | use Wikimedia\Assert\Assert; |
9 | use Wikimedia\Parsoid\Config\Env; |
10 | use Wikimedia\Parsoid\Core\DomSourceRange; |
11 | use Wikimedia\Parsoid\Core\SelectiveUpdateData; |
12 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
13 | use Wikimedia\Parsoid\DOM\Element; |
14 | use Wikimedia\Parsoid\DOM\Node; |
15 | use Wikimedia\Parsoid\DOM\Text; |
16 | use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; |
17 | use Wikimedia\Parsoid\Html2Wt\ConstrainedText\ConstrainedText; |
18 | use Wikimedia\Parsoid\Tokens\SourceRange; |
19 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
20 | use Wikimedia\Parsoid\Utils\DOMCompat; |
21 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
22 | use Wikimedia\Parsoid\Utils\DOMUtils; |
23 | use Wikimedia\Parsoid\Utils\PHPUtils; |
24 | use Wikimedia\Parsoid\Utils\Utils; |
25 | use Wikimedia\Parsoid\Utils\WTUtils; |
26 | |
27 | /** |
28 | * State object for the wikitext serializers. |
29 | */ |
30 | class SerializerState { |
31 | |
/**
 * Regexp that matches output which begins with wikimarkup that has special
 * meaning at the start of a line, and which really is at the start of the
 * line (modulo comments and other ignored, SOL-transparent elements).
 *
 * Capture group 1 is the SOL-transparent prefix; capture group 2 begins at
 * the SOL-sensitive character.
 *
 * NOTE(review): the pattern is memoized in a function-static variable, so it
 * is built from the site config seen by the first call only -- confirm that
 * this is fine if several site configs can be live in one process.
 *
 * @return string
 */
private function solWikitextRegexp(): string {
	static $cachedPattern = null;
	if ( $cachedPattern !== null ) {
		return $cachedPattern;
	}

	$siteConfig = $this->env->getSiteConfig();
	$solTransparent = PHPUtils::reStrip(
		$siteConfig->solTransparentWikitextNoWsRegexp( true ),
		'@'
	);
	$cachedPattern = sprintf( '@^(%s)([\ \*#:;{\|!=].*)$@D', $solTransparent );

	return $cachedPattern;
}
53 | |
/**
 * Regexp that matches output which leaves us at the start of a line
 * (modulo comments and other ignored, SOL-transparent elements).
 *
 * NOTE(review): the pattern is memoized in a function-static variable, so it
 * is built from the site config seen by the first call only -- confirm that
 * this is fine if several site configs can be live in one process.
 *
 * @return string
 */
private function solRegexp(): string {
	static $cachedPattern = null;
	if ( $cachedPattern !== null ) {
		return $cachedPattern;
	}

	$siteConfig = $this->env->getSiteConfig();
	$solTransparent = PHPUtils::reStrip(
		$siteConfig->solTransparentWikitextNoWsRegexp( true ),
		'@'
	);
	$cachedPattern = sprintf( '@(^|\n)%s$@D', $solTransparent );

	return $cachedPattern;
}
71 | |
72 | /** |
73 | * Separator information: |
74 | * - constraints (array<array|int>|null): min/max number of newlines |
75 | * - src (string|null): collected separator text from DOM text/comment nodes |
76 | * - lastSourceNode (?Node): Seems to be bookkeeping to make sure we don't reuse |
77 | * original separators when `emitChunk` is called |
78 | * consecutively on the same node. However, it also |
79 | * differs from `state.prevNode` in that it only gets |
80 | * updated when a node calls `emitChunk` so that nodes |
81 | * serializing `justChildren` don't mix up `buildSep`. |
82 | * FIXME: could use a dedicated class |
83 | * @var stdClass |
84 | */ |
85 | public $sep; |
86 | |
87 | /** |
88 | * Is the serializer at the start of a new wikitext line? |
89 | * @var bool |
90 | */ |
91 | public $onSOL = true; |
92 | |
93 | /** |
94 | * True when wts kicks off, false after the first char has been output |
95 | * SSS FIXME: Can this be done away with in some way? |
96 | * @var bool |
97 | */ |
98 | public $atStartOfOutput = true; |
99 | |
100 | /** |
101 | * Is the serializer currently handling link content (children of `<a>`)? |
102 | * @var bool |
103 | */ |
104 | public $inLink = false; |
105 | |
106 | /** |
107 | * Is the serializer currently handling caption content? |
108 | * @var bool |
109 | */ |
110 | public $inCaption = false; |
111 | |
112 | /** |
113 | * Is the serializer currently handling an indent-pre tag? |
114 | * @var bool |
115 | */ |
116 | public $inIndentPre = false; |
117 | |
118 | /** |
119 | * Is the serializer currently handling a html-pre tag? |
120 | * @var bool |
121 | */ |
122 | public $inHTMLPre = false; |
123 | |
124 | /** |
125 | * Is the serializer currently handling a tag that the PHP parser |
126 | * treats as a block tag? |
127 | * @var bool |
128 | */ |
129 | public $inPHPBlock = false; |
130 | |
131 | /** |
132 | * Is the serializer being invoked recursively to serialize a |
133 | * template-generated attribute (via `WSP.getAttributeValue`'s |
134 | * template handling). If so, we should suppress some |
135 | * serialization escapes, like autolink protection, since |
136 | * these are not valid for attribute values. |
137 | * @var bool |
138 | */ |
139 | public $inAttribute = false; |
140 | |
141 | /** |
142 | * Is the serializer currently processing a subtree that has been |
143 | * marked inserted compared to original content (ex: via VE / CX)? |
144 | * |
145 | * @var bool |
146 | */ |
147 | public $inInsertedContent; |
148 | |
149 | /** |
150 | * Did we introduce nowikis for indent-pre protection? |
151 | * If yes, we might run a post-pass to strip useless ones. |
152 | * @var bool |
153 | */ |
154 | public $hasIndentPreNowikis = false; |
155 | |
156 | /** |
157 | * Did we introduce nowikis to preserve quote semantics? |
158 | * If yes, we might run a post-pass to strip useless ones. |
159 | * @var bool |
160 | */ |
161 | public $hasQuoteNowikis = false; |
162 | |
163 | /** |
164 | * Did we introduce `<nowiki />`s? |
165 | * If yes, we do a postpass to remove unnecessary trailing ones. |
166 | * @var bool |
167 | */ |
168 | public $hasSelfClosingNowikis = false; |
169 | |
170 | /** |
171 | * Did we introduce nowikis around `=.*=` text? |
172 | * If yes, we do a postpass to remove unnecessary escapes. |
173 | * @var bool |
174 | */ |
175 | public $hasHeadingEscapes = false; |
176 | |
177 | /** |
178 | * Records the nesting level of wikitext tables |
179 | * @var int |
180 | */ |
181 | public $wikiTableNesting = 0; |
182 | |
183 | /** |
184 | * Stack of wikitext escaping handlers -- these handlers are responsible |
185 | * for smart escaping when the surrounding wikitext context is known. |
186 | * @var callable[] See {@link serializeChildren()} |
187 | */ |
188 | public $wteHandlerStack = []; |
189 | |
190 | /** |
191 | * This array is used by the wikitext escaping algorithm -- represents |
192 | * a "single line" of output wikitext as represented by a block node in |
193 | * the DOM. |
194 | * - firstNode (?Node): first DOM node processed on this line |
195 | * - text (string): output so far from all nodes on the current line |
196 | * - chunks (ConstrainedText[]): list of chunks comprising the current line |
197 | * @var stdClass |
198 | * XXX: replace with output buffering per line |
199 | * FIXME: could use a dedicated class |
200 | */ |
201 | public $currLine; |
202 | |
203 | /** |
204 | * Stack used to enforce single-line context |
205 | * @var SingleLineContext |
206 | */ |
207 | public $singleLineContext; |
208 | |
209 | /** |
210 | * Text to be emitted at the start of file, for redirects |
211 | * @var string|null |
212 | */ |
213 | public $redirectText = null; |
214 | |
215 | /** @var WikitextSerializer */ |
216 | public $serializer; |
217 | |
218 | /** @var ParsoidExtensionAPI */ |
219 | public $extApi; |
220 | |
221 | /** @var string The serialized output */ |
222 | public $out = ''; |
223 | |
224 | /** |
225 | * Whether to use heuristics to determine if a list item, heading, table cell, etc. |
226 | * should have whitespace inserted after the "*#=|!" wikitext chars? This is normally |
227 | * true by default, but not so if HTML content version is older than 1.7.0. |
228 | * In practice, we are now at version 2.1, but Flow stores HTML, so till Flow migrates |
229 | * all its content over to a later version, we need a boolean flag. |
230 | * @var bool |
231 | */ |
232 | public $useWhitespaceHeuristics; |
233 | |
234 | /** |
235 | * Are we in selective serialization mode? |
236 | * @see SelectiveSerializer |
237 | * @var bool |
238 | */ |
239 | public $selserMode; |
240 | |
241 | private ?SelectiveUpdateData $selserData; |
242 | |
243 | /** |
244 | * If in selser mode, while processing a node, do we know if |
245 | * its previous node has not been modified in an edit? |
246 | * @var bool |
247 | */ |
248 | public $prevNodeUnmodified; |
249 | |
250 | /** |
251 | * If in selser mode, while processing a node, do we know if |
252 | * it has not been modified in an edit? |
253 | * @var bool |
254 | */ |
255 | public $currNodeUnmodified; |
256 | |
257 | /** |
258 | * Should we run the wikitext escaping code on the wikitext chunk |
259 | * that will be emitted? |
260 | * @var bool |
261 | */ |
262 | public $needsEscaping = false; |
263 | |
264 | /** |
265 | * Used as fast patch for special protected characters in WikitextEscapeHandlers and |
266 | * comes from LanguageVariantHandler |
267 | * @var string|null |
268 | */ |
269 | public $protect; |
270 | |
271 | /** @var Separators */ |
272 | public $separators; |
273 | |
274 | /** @var Env */ |
275 | private $env; |
276 | |
277 | /** @var Element */ |
278 | public $currNode; |
279 | |
280 | /** @var Element */ |
281 | private $prevNode; |
282 | |
283 | /** @var array */ |
284 | public $openAnnotations; |
285 | |
286 | /** |
287 | * Log prefix to use in trace output |
288 | * @var string |
289 | */ |
290 | private $logPrefix = 'OUT:'; |
291 | |
292 | public $haveTrimmedWsDSR = false; |
293 | |
/**
 * Set up a fresh serializer state bound to the given serializer.
 *
 * @param WikitextSerializer $serializer
 * @param array $options List of options for serialization:
 *   - onSOL: (bool)
 *   - inPHPBlock: (bool)
 *   - inAttribute: (bool)
 *   - protect: (string)
 *   - selserData: (SelectiveUpdateData)
 */
public function __construct( WikitextSerializer $serializer, array $options = [] ) {
	$this->serializer = $serializer;
	$this->env = $serializer->env;
	$this->extApi = new ParsoidExtensionAPI( $this->env, [ 'html2wt' => [ 'state' => $this ] ] );

	// Boolean flags keep their declared defaults unless explicitly overridden.
	if ( isset( $options['onSOL'] ) ) {
		$this->onSOL = $options['onSOL'];
	}
	if ( isset( $options['inPHPBlock'] ) ) {
		$this->inPHPBlock = $options['inPHPBlock'];
	}
	if ( isset( $options['inAttribute'] ) ) {
		$this->inAttribute = $options['inAttribute'];
	}
	$this->protect = $options['protect'] ?? null;
	$this->selserData = $options['selserData'] ?? null;

	// Start with an empty line buffer and empty separator state.
	$this->resetCurrLine( null );
	$this->singleLineContext = new SingleLineContext();
	$this->resetSep();

	$contentVersion = $this->env->getInputContentVersion();
	$this->haveTrimmedWsDSR = Semver::satisfies( $contentVersion, '>=2.1.1' );

	$this->separators = new Separators( $this->env, $this );
}
318 | |
/**
 * Accessor for the environment this state was constructed with.
 *
 * @note Porting note: this replaces direct access
 * @return Env
 */
public function getEnv(): Env {
	$env = $this->env;
	return $env;
}
326 | |
/**
 * Initialize the mode-dependent boolean flags: records whether we are in
 * selective serialization mode and enables the whitespace heuristics when
 * the input content version satisfies ">=1.7.0".
 *
 * FIXME: Ideally, this should be private. Requires shuffling around
 * where SerializerState is constructed so that $selserMode is known
 * at the time of construction.
 * @private for use by WikitextSerializer only
 * @param bool $selserMode Are we running selective serialization?
 */
public function initMode( bool $selserMode ): void {
	$this->selserMode = $selserMode;

	$contentVersion = $this->env->getInputContentVersion();
	$this->useWhitespaceHeuristics = Semver::satisfies( $contentVersion, '>=1.7.0' );
}
340 | |
/**
 * Append separator source to the pending separator buffer ($this->sep->src).
 *
 * $this->onSOL is deliberately left alone since this string hasn't been
 * emitted yet. Content handlers whose behavior depends on whether this
 * newline will be emitted should peek into the buffer (ex: see TDHandler
 * and THHandler code).
 *
 * @param string $src
 */
public function appendSep( string $src ): void {
	// The buffer is null right after a reset; treat that as empty.
	$this->sep->src ??= '';
	$this->sep->src .= $src;
}
353 | |
/**
 * Cycle the separator state after processing a node: remember $node as the
 * last node that was the source of separator handling.
 *
 * @param Node $node
 */
public function updateSep( Node $node ): void {
	$sepState = $this->sep;
	$sepState->lastSourceNode = $node;
}
361 | |
/**
 * Reset the separator state: no constraints, no collected separator
 * source, and no last source node.
 */
private function resetSep(): void {
	$this->sep = (object)[
		'constraints' => null,
		'src' => null,
		'lastSourceNode' => null,
	];
}
369 | |
/**
 * Start a new, empty current-line record whose first node is $node.
 *
 * @param ?Node $node First DOM node processed on the new line, if known
 */
private function resetCurrLine( ?Node $node ): void {
	$line = new stdClass;
	$line->text = '';
	$line->chunks = [];
	$line->firstNode = $node;

	$this->currLine = $line;
}
381 | |
/**
 * Process the buffered ConstrainedText chunks of the current line (adjusting
 * chunk boundaries as necessary), append the result to the output, and clear
 * the chunk buffer.
 *
 * Start of line and end of line are always safe for ConstrainedText chunks,
 * so we never need to buffer more than the last line.
 */
private function flushLine(): void {
	$line = $this->currLine;
	$escapedLine = ConstrainedText::escapeLine( $line->chunks );
	$this->out .= $escapedLine;
	$line->chunks = [];
}
391 | |
/**
 * Extracts the subset of the page source bound by the supplied source range.
 *
 * Only valid in selser mode (asserted). Returns null when the range is
 * inverted or starts beyond the end of the revision text.
 *
 * @param SourceRange $sr
 * @return string|null
 */
public function getOrigSrc( SourceRange $sr ): ?string {
	Assert::invariant( $this->selserMode, 'SerializerState::$selserMode must be set' );

	if ( !( $sr->start <= $sr->end ) ) {
		return null;
	}

	$revText = $this->selserData->revText;
	// FIXME: Having a $start greater than the source length is
	// probably a canary for corruption. Maybe we should be throwing
	// here instead. See T240053.
	// But, see comment in UnpackDOMFragments where we very very rarely
	// can deliberately set DSR to point outside page source.
	if ( !( $sr->start <= strlen( $revText ) ) ) {
		return null;
	}

	// XXX should use $frame->getSrcText() like WTUtils::getWTSource
	return $sr->substr( $revText );
}
414 | |
/**
 * Check the validity of a DSR in the context of the page source.
 *
 * Returns false if Utils::isValidDSR() would return false, but also
 * returns false if the range is inverted, extends past the end of the
 * revision text, or if the DSR offsets would create a bad UTF-8 string
 * (ie, the start offsets don't point to a valid UTF-8 start character,
 * or the range ends in the middle of a multi-byte sequence).
 *
 * @param ?DomSourceRange $dsr DSR source range values
 * @param bool $all Also check the widths of the container tag
 * @return bool
 */
public function isValidDSR( ?DomSourceRange $dsr, bool $all = false ): bool {
	if ( !Utils::isValidDSR( $dsr, $all ) ) {
		return false;
	}
	if ( !( $dsr->start <= $dsr->end ) ) {
		return false;
	}
	$src = $this->selserData->revText;
	if ( !( $dsr->end <= strlen( $src ) ) ) {
		return false;
	}

	// Checks that the byte range [$start, $end) of $src begins on a UTF-8
	// start byte and ends exactly at the end of a UTF-8 sequence.
	$isUtf8Aligned = static function ( $start, $end ) use ( $src ) {
		if ( $start === $end ) {
			// zero-length string is always ok
			return true;
		}
		if ( ( ord( $src[$start] ) & 0xC0 ) === 0x80 ) {
			return false; // bad UTF-8 at start of string
		}
		// Walk backwards from $end over continuation bytes (10xx xxxx)
		// until we hit the lead byte of the final sequence. This won't
		// pass $start because we've already established that the first
		// byte isn't a continuation byte.
		$back = 0;
		do {
			$back++;
			if ( $back >= 5 ) {
				return false; // bad UTF-8 at end of string (>4 byte sequence)
			}
			$leadByte = ord( $src[$end - $back] );
		} while ( ( $leadByte & 0xC0 ) === 0x80 );

		// The lead byte announces the sequence length; it must equal the
		// number of bytes we just walked over.
		if ( ( $leadByte & 0x80 ) === 0 ) {
			$seqLen = 1;
		} elseif ( ( $leadByte & 0xE0 ) === 0xC0 ) {
			$seqLen = 2;
		} elseif ( ( $leadByte & 0xF0 ) === 0xE0 ) {
			$seqLen = 3;
		} elseif ( ( $leadByte & 0xF8 ) === 0xF0 ) {
			$seqLen = 4;
		} else {
			return false;
		}
		return $back === $seqLen;
	};

	if ( !$all ) {
		return $isUtf8Aligned( $dsr->start, $dsr->end );
	}

	// Check each of the inner ranges: open tag, close tag, then content.
	// Every bounds comparison guards the alignment check that follows it.
	$openEnd = $dsr->start + $dsr->openWidth;
	$closeStart = $dsr->end - $dsr->closeWidth;

	return $openEnd <= $dsr->end
		&& $isUtf8Aligned( $dsr->start, $openEnd )
		&& $dsr->start <= $closeStart
		&& $isUtf8Aligned( $closeStart, $dsr->end )
		&& $openEnd <= $closeStart
		&& $isUtf8Aligned( $openEnd, $closeStart );
}
492 | |
/**
 * Advance the selser modification bookkeeping to $node: $node becomes the
 * previous node, the current "unmodified" flag becomes the previous one,
 * and the current flag is cleared.
 *
 * @param Node $node
 */
public function updateModificationFlags( Node $node ): void {
	$this->prevNode = $node;
	// Shift current -> previous before clearing the current flag.
	$this->prevNodeUnmodified = $this->currNodeUnmodified;
	$this->currNodeUnmodified = false;
}
502 | |
/**
 * Update start-of-line and line-buffer state for a separator that has just
 * been pushed: separators put us in SOL state.
 *
 * @param string $sep
 * @param Node $node
 */
private function sepIntroducedSOL( string $sep, Node $node ): void {
	// Don't get tripped by newlines in comments! Be wary of nowikis added
	// by makeSepIndentPreSafe on the last line.
	$visibleSep = preg_replace( Utils::COMMENT_REGEXP, '', $sep );

	if ( str_ends_with( $visibleSep, "\n" ) ) {
		$this->onSOL = true;
	}

	if ( !str_contains( $visibleSep, "\n" ) ) {
		return;
	}

	// A newline ended the line: process escapes in our full line and
	// start buffering a new one.
	$this->flushLine();
	$this->resetCurrLine( $node );
}
522 | |
/**
 * Accumulates chunks on the current line: appends the chunk's text to the
 * line text, appends the chunk to the line's chunk list, and traces it.
 *
 * @param ConstrainedText $chunk
 * @param string $logPrefix Prefix used in the trace output
 */
private function pushToCurrLine( ConstrainedText $chunk, string $logPrefix ): void {
	// Emitting text that has not been escaped
	$this->currLine->text .= $chunk->text;

	$this->currLine->chunks[] = $chunk;

	$this->serializer->trace( '--->', $logPrefix, static function () use ( $chunk ) {
		return PHPUtils::jsonEncode( $chunk->text );
	} );
}
538 | |
/**
 * Pushes the separator to the current line and resets the separator state.
 *
 * @param string $sep Separator text to emit
 * @param Node $node Node the separator is emitted for
 * @param string $debugPrefix Prefix used in the trace output
 */
private function emitSep( string $sep, Node $node, string $debugPrefix ): void {
	$sepChunk = ConstrainedText::cast( $sep, $node );

	// Replace newlines if we're in a single-line context.
	// Plain string replacement (same as emitChunk): no regexp is needed
	// for a literal newline, and str_replace never yields null.
	if ( $this->singleLineContext->enforced() ) {
		$sepChunk->text = str_replace( "\n", ' ', $sepChunk->text );
	}

	$this->pushToCurrLine( $sepChunk, $debugPrefix );
	$this->sepIntroducedSOL( $sepChunk->text, $node );

	// Reset separator state
	$this->resetSep();
	$this->updateSep( $node );
}
560 | |
/**
 * Determines if we can use the original separator for this node or if we
 * need to build one based on its constraints, and then emits it.
 *
 * When block nodes are deleted, the deletion affects whether unmodified
 * newline separators between a pair of unmodified P tags can be reused.
 *
 * Example:
 * ```
 * Original WT  : "<div>x</div>foo\nbar"
 * Original HTML: "<div>x</div><p>foo</p>\n<p>bar</p>"
 * Edited HTML  : "<p>foo</p>\n<p>bar</p>"
 * Annotated DOM: "<mw:DiffMarker is-block><p>foo</p>\n<p>bar</p>"
 * Expected WT  : "foo\n\nbar"
 * ```
 *
 * Note the additional newline between "foo" and "bar" even though
 * originally there was just a single newline.
 *
 * So, even though the two P tags and the separator between them are
 * unmodified, relying on just that is insufficient. We have to look at
 * what has happened on the two wikitext lines onto which the two P tags
 * will get serialized.
 *
 * Now, `nextToDeletedBlockNodeInWT` is not really looking at ALL the nodes
 * before/after the nodes that could serialize onto the wikitext lines. It
 * looks at the immediately adjacent nodes only, i.e. it is not necessary
 * to check whether a block tag was deleted 2 or 5 siblings away. If we had
 * to examine all of those nodes, this would get very complex, and it would
 * be much simpler to just discard the original separators => potentially
 * lots of dirty diffs.
 *
 * To understand why it is sufficient (for correctness) to examine just the
 * immediately adjacent nodes, consider an additional example:
 * ```
 * Original WT  : "a<div>b</div>c<div>d</div>e\nf"
 * Original HTML: "<p>a</p><div>b</div><p>c</p><div>d</div><p>e</p>\n<p>f</p>"
 * ```
 * Note how `<block>` tags and `<p>` tags interleave in the HTML. This is
 * always the case no matter how much inline content shows up between the
 * block tags in wikitext. If the b-`<div>` was deleted, we don't care about
 * it, since we still have the d-`<div>` before the P tag that preserves the
 * correctness of the single `"\n"` separator. If the d-`<div>` was deleted,
 * we conservatively ignore the original separator and let normal P-P
 * constraints take care of it. At worst, we might generate a dirty diff in
 * this scenario.
 *
 * @param Node $node
 */
private function emitSepForNode( Node $node ): void {
	$prev = $this->prevNode;

	// An original separator is only needed when this node has not already
	// been the source of the last emitted separator.
	$canReuseOrigSep = false;
	if ( $node !== $this->sep->lastSourceNode ) {
		// first-content-node of <body> ($this->prevNode)
		$followsBody = DOMUtils::isBody( $prev ) && $node->parentNode === $prev;

		// unmodified sibling node of $this->prevNode
		$followsUnmodifiedSibling = !$followsBody &&
			$prev && $this->prevNodeUnmodified &&
			$node->parentNode === $prev->parentNode &&
			!WTSUtils::nextToDeletedBlockNodeInWT( $prev, true );

		$canReuseOrigSep = ( $followsBody || $followsUnmodifiedSibling ) &&
			$this->currNodeUnmodified &&
			!WTSUtils::nextToDeletedBlockNodeInWT( $node, false );
	}

	$origSep = null;
	if ( $canReuseOrigSep ) {
		if ( $prev instanceof Element && $node instanceof Element ) {
			'@phan-var Element $node';/** @var Element $node */
			// <body> won't have DSR in body_only scenarios
			$fromRange = DOMUtils::isBody( $prev )
				? new SourceRange( 0, 0 )
				: DOMDataUtils::getDataParsoid( $prev )->dsr;
			$sepRange = $fromRange->to( DOMDataUtils::getDataParsoid( $node )->dsr );
			$origSep = $this->getOrigSrc( $sepRange );
		} elseif ( $this->sep->src && WTSUtils::isValidSep( $this->sep->src ) ) {
			// We don't know where '$this->sep->src' comes from. So, reuse it
			// only if it is a valid separator string.
			$origSep = $this->sep->src;
		}
	}

	if ( $origSep === null ) {
		// No reusable original separator: build one from the constraints.
		$builtSep = $this->separators->buildSep( $node );
		$this->emitSep( $builtSep ?? '', $node, 'SEP:' );
		return;
	}

	$this->emitSep( $origSep, $node, 'ORIG-SEP:' );
}
653 | |
/**
 * Recovers any trimmed whitespace for $node via the separators helper,
 * traces it, and returns it to the caller.
 *
 * @param Node $node
 * @param bool $leading
 *   if true, trimmed leading whitespace is recovered
 *   if false, trimmed trailing whitespace is recovered
 * @return string|null
 */
public function recoverTrimmedWhitespace( Node $node, bool $leading ): ?string {
	$recovered = $this->separators->recoverTrimmedWhitespace( $node, $leading );

	$this->serializer->trace(
		'--->',
		"TRIMMED-SEP:",
		static fn () => PHPUtils::jsonEncode( $recovered )
	);

	return $recovered;
}
669 | |
/**
 * Pushes the chunk to the current line, emitting any pending separator
 * first and wikitext-escaping the chunk when required.
 *
 * @param ConstrainedText|string $res
 * @param Node $node
 */
public function emitChunk( $res, Node $node ): void {
	$chunk = ConstrainedText::cast( $res, $node );

	// Replace newlines if we're in a single-line context
	if ( $this->singleLineContext->enforced() ) {
		$chunk->text = str_replace( "\n", ' ', $chunk->text );
	}

	// Emit separator first
	if ( !$chunk->noSep ) {
		$this->emitSepForNode( $node );
	} elseif ( $this->onSOL ) {
		// Separators are skipped for internal tokens from SelSer, but we
		// still process escapes in our full line when at start-of-line.
		$this->flushLine();
		$this->resetCurrLine( $node );
	}

	$shouldEscape = $this->needsEscaping;
	if ( $shouldEscape && $this->currNode instanceof Text ) {
		$shouldEscape = !$this->inHTMLPre && ( $this->onSOL || !$this->currNodeUnmodified );
	}

	if ( $shouldEscape ) {
		// Escape the chunk text, carrying over prefix / suffix / node.
		$escapedText = $this->serializer->escapeWikitext( $this, $chunk->text, [
			'node' => $node,
			'isLastChild' => DiffDOMUtils::nextNonDeletedSibling( $node ) === null,
		] );
		$chunk = new ConstrainedText( [
			'text' => $escapedText,
			'prefix' => $chunk->prefix,
			'suffix' => $chunk->suffix,
			'node' => $chunk->node,
		] );
		$this->needsEscaping = false;
	} elseif (
		// If the chunk is coming from selser and the current node is a paragraph
		// tag, check if it might need some leading chars nowiki-escaped before
		// being output. Because of block-tag p-wrapping behavior, sol-sensitive
		// characters that used to be in non-sol positions, but yet wrapped in
		// p-tags, could end up in sol-position if those block tags get deleted
		// during edits.
		//
		// Ex: a<div>foo</div>*b
		// -- wt2html --> <p>a</p><div>foo<div><p>*b</p>
		// --   EDIT  --> <p>a</p><p>*b</p>
		// -- html2wt --> a\n\n<nowiki>*</nowiki>b
		//
		// In this scenario, the <p>a</p>, <p>*b</p>, and <p>#c</p>
		// will be marked unmodified and will be processed below.
		$this->selserMode
		&& $this->onSOL
		&& $this->currNodeUnmodified
		// 'node' came from original Parsoid HTML unmodified. So, if its content
		// needs nowiki-escaping, we know that the reason it didn't parse into
		// lists/headings/whatever is because it didn't occur at the start of the
		// line => it had a block-tag in the original wikitext. So if the previous
		// node was also unmodified (and since it also came from original Parsoid
		// HTML), we can safely infer that it couldn't have been an inline node or
		// a P-tag (if it were, the p-wrapping code would have swallowed that content
		// into 'node'). So, it would have to be some sort of block tag => this.onSOL
		// couldn't have been true (because we could have serialized 'node' on the
		// same line as the block tag) => we can save some effort by eliminating
		// scenarios where 'this.prevNodeUnmodified' is true.
		&& !$this->prevNodeUnmodified
		&& DOMCompat::nodeName( $node ) === 'p' && !WTUtils::isLiteralHTMLNode( $node )
		// If a text node, we have to make sure that the text doesn't
		// get reparsed as non-text in the wt2html pipeline.
		&& DiffDOMUtils::firstNonSepChild( $node ) instanceof Text
	) {
		$match = $chunk->matches( $this->solWikitextRegexp(), $this->env );
		$solText = ( $match && isset( $match[2] ) ) ? $match[2] : null;
		if ( $solText !== null && (
			preg_match( '/^([\*#:;]|{\||.*=$)/D', $solText )
			// ! and | chars are harmless outside tables
			|| ( strspn( $solText, '|!' ) && $this->wikiTableNesting > 0 )
			// indent-pres are suppressed inside <blockquote>
			|| ( preg_match( '/^ \S/', $solText )
				&& !DOMUtils::hasNameOrHasAncestorOfName( $node, 'blockquote' ) )
		) ) {
			// Wrap just the first SOL-sensitive character in a nowiki.
			$protectedText = ( $match[1] ?: '' )
				. '<nowiki>' . substr( $solText, 0, 1 ) . '</nowiki>'
				. substr( $solText, 1 );
			$chunk = ConstrainedText::cast( $protectedText, $node );
		}
	}

	// Output the chunk
	$this->pushToCurrLine( $chunk, $this->logPrefix );

	// Update sol flag. Test for newlines followed by optional includeonly or comments
	if ( !$chunk->matches( $this->solRegexp(), $this->env ) ) {
		$this->onSOL = false;
	}

	// We've emitted something so we're no longer at SOO.
	$this->atStartOfOutput = false;
}
776 | |
/**
 * Serialize the children of a DOM node, sharing the global serializer state.
 * Typically called by a DOM-based handler to continue handling its children.
 *
 * @param Element|DocumentFragment $node
 * @param ?callable $wtEscaper ( $state, $text, $opts )
 *   Pushed onto the wikitext-escaping handler stack for the duration of the call.
 *   PORT-FIXME document better; should this be done via WikitextEscapeHandlers somehow?
 * @param ?Node $firstChild Child to start from; defaults to $node's first child
 */
public function serializeChildren(
	Node $node, ?callable $wtEscaper = null, ?Node $firstChild = null
): void {
	// SSS FIXME: Unsure if this is the right thing always
	$pushedEscaper = (bool)$wtEscaper;
	if ( $pushedEscaper ) {
		$this->wteHandlerStack[] = $wtEscaper;
	}

	// serializeNode() always hands back the next child to process.
	$next = $firstChild ?: $node->firstChild;
	while ( $next !== null ) {
		$next = $this->serializer->serializeNode( $next );
	}

	if ( $pushedEscaper ) {
		array_pop( $this->wteHandlerStack );
	}

	// If we serialized children explicitly,
	// we were obviously processing a modified node.
	$this->currNodeUnmodified = false;
}
807 | |
808 | /** |
809 | * Abstracts some steps taken in `serializeChildrenToString` and `serializeDOM`: |
810 | * updates separator and modification state for $node, serializes its children, emits the trailing separator and flushes the line. |
811 | * @param Element|DocumentFragment $node Node whose children are serialized |
812 | * @param ?callable $wtEscaper See {@link serializeChildren()} |
813 | * @internal For use by WikitextSerializer only |
814 | */ |
815 | public function kickOffSerialize( |
816 | Node $node, ?callable $wtEscaper = null |
817 | ): void { |
818 | $this->updateSep( $node ); |
819 | $this->currNodeUnmodified = false; |
820 | $this->updateModificationFlags( $node ); |
821 | $this->resetCurrLine( $node->firstChild ); |
822 | $this->serializeChildren( $node, $wtEscaper ); |
823 | // Emit the child-parent separators. |
824 | $this->emitSepForNode( $node ); |
825 | // We've reached EOF, so flush the remaining buffered text. |
826 | $this->flushLine(); |
827 | } |
828 | |
829 | /** |
830 | * Serialize children to a string, saving the serializer state beforehand and restoring it afterwards |
831 | * |
832 | * FIXME(arlorla): Shouldn't affect the separator state, but accidents have |
833 | * been known to happen. T109793 suggests using its own wts / state. |
834 | * |
835 | * @param Element|DocumentFragment $node Node whose children are serialized |
836 | * @param ?callable $wtEscaper See {@link serializeChildren()} |
837 | * @param string $inState Name of the state flag property set to true while serializing; must be one of the names in $states |
838 | * @return string The text accumulated in $this->out while serializing the children |
839 | */ |
840 | private function serializeChildrenToString( |
841 | Node $node, ?callable $wtEscaper, string $inState |
842 | ): string { |
843 | $states = [ 'inLink', 'inCaption', 'inIndentPre', 'inHTMLPre', 'inPHPBlock', 'inAttribute' ]; |
844 | Assert::parameter( in_array( $inState, $states, true ), '$inState', 'Must be one of: ' |
845 | . implode( ', ', $states ) ); |
846 | // FIXME: Make sure that the separators emitted here conform to the |
847 | // syntactic constraints of the syntactic context. |
848 | $oldSep = $this->sep; |
849 | $oldSOL = $this->onSOL; |
850 | $oldOut = $this->out; |
851 | $oldStart = $this->atStartOfOutput; |
852 | $oldCurrLine = $this->currLine; |
853 | $oldLogPrefix = $this->logPrefix; |
854 | // Modification flags |
855 | $oldPrevNodeUnmodified = $this->prevNodeUnmodified; |
856 | $oldCurrNodeUnmodified = $this->currNodeUnmodified; |
857 | $oldPrevNode = $this->prevNode; |
858 | |
859 | $this->out = ''; |
860 | $this->logPrefix = 'OUT(C):'; |
861 | $this->resetSep(); |
862 | $this->onSOL = false; |
863 | $this->atStartOfOutput = false; |
864 | $this->$inState = true; |
865 | |
866 | $this->singleLineContext->disable(); |
867 | $this->kickOffSerialize( $node, $wtEscaper ); |
868 | $this->singleLineContext->pop(); |
869 | |
870 | // Capture the output, then restore the saved state ($inState is reset to false, not to its prior value) |
871 | $bits = $this->out; |
872 | $this->out = $oldOut; |
873 | $this->$inState = false; |
874 | $this->sep = $oldSep; |
875 | $this->onSOL = $oldSOL; |
876 | $this->atStartOfOutput = $oldStart; |
877 | $this->currLine = $oldCurrLine; |
878 | $this->logPrefix = $oldLogPrefix; |
879 | // Modification flags |
880 | $this->prevNodeUnmodified = $oldPrevNodeUnmodified; |
881 | $this->currNodeUnmodified = $oldCurrNodeUnmodified; |
882 | $this->prevNode = $oldPrevNode; |
883 | return $bits; |
884 | } |
885 | |
886 | /** |
887 | * Serialize children of a link to a string (delegates to serializeChildrenToString() with the 'inLink' state) |
888 | * @param Element|DocumentFragment $node Node whose children are serialized |
889 | * @param ?callable $wtEscaper See {@link serializeChildren()} |
890 | * @return string Result of serializeChildrenToString() |
891 | */ |
892 | public function serializeLinkChildrenToString( |
893 | Node $node, ?callable $wtEscaper = null |
894 | ): string { |
895 | return $this->serializeChildrenToString( $node, $wtEscaper, 'inLink' ); |
896 | } |
897 | |
898 | /** |
899 | * Serialize children of a caption to a string (delegates to serializeChildrenToString() with the 'inCaption' state) |
900 | * @param Element|DocumentFragment $node Node whose children are serialized |
901 | * @param ?callable $wtEscaper See {@link serializeChildren()} |
902 | * @return string Result of serializeChildrenToString() |
903 | */ |
904 | public function serializeCaptionChildrenToString( |
905 | Node $node, ?callable $wtEscaper = null |
906 | ): string { |
907 | return $this->serializeChildrenToString( $node, $wtEscaper, 'inCaption' ); |
908 | } |
909 | |
910 | /** |
911 | * Serialize children of an indent-pre to a string (delegates to serializeChildrenToString() with the 'inIndentPre' state) |
912 | * @param Element|DocumentFragment $node Node whose children are serialized |
913 | * @param ?callable $wtEscaper See {@link serializeChildren()} |
914 | * @return string Result of serializeChildrenToString() |
915 | */ |
916 | public function serializeIndentPreChildrenToString( |
917 | Node $node, ?callable $wtEscaper = null |
918 | ): string { |
919 | return $this->serializeChildrenToString( $node, $wtEscaper, 'inIndentPre' ); |
920 | } |
921 | |
922 | /** |
923 | * Take note of an open annotation range and whether it has been extended. |
924 | * @param string $ann Key under which the range is stored in openAnnotations; an existing entry for the same key is overwritten |
925 | * @param bool $extended Value stored for that key |
926 | */ |
927 | public function openAnnotationRange( string $ann, bool $extended ) { |
928 | $this->openAnnotations[$ann] = $extended; |
929 | } |
930 | |
931 | /** |
932 | * Removes the corresponding annotation range from the list of open ranges. |
933 | * @param string $ann Key of the entry to remove from openAnnotations |
934 | */ |
935 | public function closeAnnotationRange( string $ann ) { |
936 | unset( $this->openAnnotations[$ann] ); |
937 | } |
938 | |
939 | } |