Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
29.13% |
60 / 206 |
|
38.46% |
10 / 26 |
CRAP | |
0.00% |
0 / 1 |
SerializerState | |
29.13% |
60 / 206 |
|
38.46% |
10 / 26 |
2243.94 | |
0.00% |
0 / 1 |
solWikitextRegexp | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
6 | |||
solRegexp | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
6 | |||
__construct | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
1 | |||
getEnv | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
initMode | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
appendSep | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
updateSep | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
resetSep | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
resetCurrLine | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
flushLine | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
getOrigSrc | |
80.00% |
4 / 5 |
|
0.00% |
0 / 1 |
3.07 | |||
updateModificationFlags | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
sepIntroducedSOL | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
pushToCurrLine | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
emitSep | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
emitSepForNode | |
0.00% |
0 / 30 |
|
0.00% |
0 / 1 |
306 | |||
recoverTrimmedWhitespace | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
emitChunk | |
40.48% |
17 / 42 |
|
0.00% |
0 / 1 |
156.81 | |||
serializeChildren | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
5 | |||
kickOffSerialize | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 | |||
serializeChildrenToString | |
0.00% |
0 / 33 |
|
0.00% |
0 / 1 |
2 | |||
serializeLinkChildrenToString | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
serializeCaptionChildrenToString | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
serializeIndentPreChildrenToString | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
openAnnotationRange | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
closeAnnotationRange | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Html2Wt; |
5 | |
6 | use Composer\Semver\Semver; |
7 | use stdClass; |
8 | use Wikimedia\Assert\Assert; |
9 | use Wikimedia\Parsoid\Config\Env; |
10 | use Wikimedia\Parsoid\Core\SelserData; |
11 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
12 | use Wikimedia\Parsoid\DOM\Element; |
13 | use Wikimedia\Parsoid\DOM\Node; |
14 | use Wikimedia\Parsoid\DOM\Text; |
15 | use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; |
16 | use Wikimedia\Parsoid\Html2Wt\ConstrainedText\ConstrainedText; |
17 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
18 | use Wikimedia\Parsoid\Utils\DOMCompat; |
19 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
20 | use Wikimedia\Parsoid\Utils\DOMUtils; |
21 | use Wikimedia\Parsoid\Utils\PHPUtils; |
22 | use Wikimedia\Parsoid\Utils\Utils; |
23 | use Wikimedia\Parsoid\Utils\WTUtils; |
24 | |
25 | /** |
26 | * State object for the wikitext serializers. |
27 | */ |
28 | class SerializerState { |
29 | |
/**
 * Memoized result of {@link solWikitextRegexp()} for this state object.
 * @var string|null
 */
private $solWikitextRegexpCache = null;

/**
 * Regexp for checking whether the text we have consumed starts with wikitext
 * markup that has special meaning at the beginning of a line, and whether that
 * markup is indeed at the beginning of the line (modulo comments and other
 * ignored elements).
 *
 * emitChunk() reads capture group 1 as the leading start-of-line-transparent
 * text and capture group 2 as the remainder that starts at the sensitive
 * character.
 *
 * The pattern is built from the site configuration of this state's Env, so it
 * is memoized per instance. A function-level `static` would be shared by every
 * SerializerState in the process, regardless of the Env it was created for.
 *
 * @return string
 */
private function solWikitextRegexp(): string {
	if ( $this->solWikitextRegexpCache === null ) {
		// Drop the '@' delimiters so the pattern can be embedded in a larger one
		$sol = PHPUtils::reStrip(
			$this->env->getSiteConfig()->solTransparentWikitextNoWsRegexp( true ),
			'@'
		);
		$this->solWikitextRegexpCache = '@' .
			'^(' . $sol . ')' .
			'([\ \*#:;{\|!=].*)$' .
			'@D';
	}
	return $this->solWikitextRegexpCache;
}
51 | |
/**
 * Memoized result of {@link solRegexp()} for this state object.
 * @var string|null
 */
private $solRegexpCache = null;

/**
 * Regexp for checking whether we are at the start of the line (modulo comments and
 * other ignored elements).
 *
 * The pattern is built from the site configuration of this state's Env, so it
 * is memoized per instance. A function-level `static` would be shared by every
 * SerializerState in the process, regardless of the Env it was created for.
 *
 * @return string
 */
private function solRegexp(): string {
	if ( $this->solRegexpCache === null ) {
		// Drop the '@' delimiters so the pattern can be embedded in a larger one
		$sol = PHPUtils::reStrip(
			$this->env->getSiteConfig()->solTransparentWikitextNoWsRegexp( true ),
			'@'
		);
		$this->solRegexpCache = '@(^|\n)' . $sol . '$@D';
	}
	return $this->solRegexpCache;
}
69 | |
70 | /** |
71 | * Separator information: |
72 | * - constraints (array<array|int>|null): min/max number of newlines |
73 | * - src (string|null): collected separator text from DOM text/comment nodes |
74 | * - lastSourceNode (?Node): Seems to be bookkeeping to make sure we don't reuse |
75 | * original separators when `emitChunk` is called |
76 | * consecutively on the same node. However, it also |
77 | * differs from `state.prevNode` in that it only gets |
78 | * updated when a node calls `emitChunk` so that nodes |
79 | * serializing `justChildren` don't mix up `buildSep`. |
80 | * FIXME: could use a dedicated class |
81 | * @var stdClass |
82 | */ |
83 | public $sep; |
84 | |
85 | /** |
86 | * Is the serializer at the start of a new wikitext line? |
87 | * @var bool |
88 | */ |
89 | public $onSOL = true; |
90 | |
91 | /** |
92 | * True when wts kicks off, false after the first char has been output |
93 | * SSS FIXME: Can this be done away with in some way? |
94 | * @var bool |
95 | */ |
96 | public $atStartOfOutput = true; |
97 | |
98 | /** |
99 | * Is the serializer currently handling link content (children of `<a>`)? |
100 | * @var bool |
101 | */ |
102 | public $inLink = false; |
103 | |
104 | /** |
105 | * Is the serializer currently handling caption content? |
106 | * @var bool |
107 | */ |
108 | public $inCaption = false; |
109 | |
110 | /** |
111 | * Is the serializer currently handling an indent-pre tag? |
112 | * @var bool |
113 | */ |
114 | public $inIndentPre = false; |
115 | |
116 | /** |
117 | * Is the serializer currently handling a html-pre tag? |
118 | * @var bool |
119 | */ |
120 | public $inHTMLPre = false; |
121 | |
122 | /** |
123 | * Is the serializer currently handling a tag that the PHP parser |
124 | * treats as a block tag? |
125 | * @var bool |
126 | */ |
127 | public $inPHPBlock = false; |
128 | |
129 | /** |
130 | * Is the serializer being invoked recursively to serialize a |
131 | * template-generated attribute (via `WSP.getAttributeValue`'s |
132 | * template handling). If so, we should suppress some |
133 | * serialization escapes, like autolink protection, since |
134 | * these are not valid for attribute values. |
135 | * @var bool |
136 | */ |
137 | public $inAttribute = false; |
138 | |
139 | /** |
140 | * Is the serializer currently processing a subtree that has been |
141 | * marked inserted compared to original content (ex: via VE / CX)? |
142 | * |
143 | * @var bool |
144 | */ |
145 | public $inInsertedContent; |
146 | |
147 | /** |
148 | * Did we introduce nowikis for indent-pre protection? |
149 | * If yes, we might run a post-pass to strip useless ones. |
150 | * @var bool |
151 | */ |
152 | public $hasIndentPreNowikis = false; |
153 | |
154 | /** |
155 | * Did we introduce nowikis to preserve quote semantics? |
156 | * If yes, we might run a post-pass to strip useless ones. |
157 | * @var bool |
158 | */ |
159 | public $hasQuoteNowikis = false; |
160 | |
161 | /** |
162 | * Did we introduce `<nowiki />`s? |
163 | * If yes, we do a postpass to remove unnecessary trailing ones. |
164 | * @var bool |
165 | */ |
166 | public $hasSelfClosingNowikis = false; |
167 | |
168 | /** |
169 | * Did we introduce nowikis around `=.*=` text? |
170 | * If yes, we do a postpass to remove unnecessary escapes. |
171 | * @var bool |
172 | */ |
173 | public $hasHeadingEscapes = false; |
174 | |
175 | /** |
176 | * Records the nesting level of wikitext tables |
177 | * @var int |
178 | */ |
179 | public $wikiTableNesting = 0; |
180 | |
181 | /** |
182 | * Stack of wikitext escaping handlers -- these handlers are responsible |
183 | * for smart escaping when the surrounding wikitext context is known. |
184 | * @var callable[] See {@link serializeChildren()} |
185 | */ |
186 | public $wteHandlerStack = []; |
187 | |
188 | /** |
189 | * This array is used by the wikitext escaping algorithm -- represents |
190 | * a "single line" of output wikitext as represented by a block node in |
191 | * the DOM. |
192 | * - firstNode (?Node): first DOM node processed on this line |
193 | * - text (string): output so far from all nodes on the current line |
194 | * - chunks (ConstrainedText[]): list of chunks comprising the current line |
195 | * @var stdClass |
196 | * XXX: replace with output buffering per line |
197 | * FIXME: could use a dedicated class |
198 | */ |
199 | public $currLine; |
200 | |
201 | /** |
202 | * Stack used to enforce single-line context |
203 | * @var SingleLineContext |
204 | */ |
205 | public $singleLineContext; |
206 | |
207 | /** |
208 | * Text to be emitted at the start of file, for redirects |
209 | * @var string|null |
210 | */ |
211 | public $redirectText = null; |
212 | |
213 | /** @var WikitextSerializer */ |
214 | public $serializer; |
215 | |
216 | /** @var ParsoidExtensionAPI */ |
217 | public $extApi; |
218 | |
219 | /** @var string The serialized output */ |
220 | public $out = ''; |
221 | |
222 | /** |
223 | * Whether to use heuristics to determine if a list item, heading, table cell, etc. |
224 | * should have whitespace inserted after the "*#=|!" wikitext chars? This is normally |
225 | * true by default, but not so if HTML content version is older than 1.7.0. |
226 | * In practice, we are now at version 2.1, but Flow stores HTML, so till Flow migrates |
227 | * all its content over to a later version, we need a boolean flag. |
228 | * @var bool |
229 | */ |
230 | public $useWhitespaceHeuristics; |
231 | |
232 | /** |
233 | * Are we in selective serialization mode? |
234 | * @see SelectiveSerializer |
235 | * @var bool |
236 | */ |
237 | public $selserMode; |
238 | |
239 | /** @var SelserData|null Null unless supplied through the 'selserData' constructor option */
240 | private $selserData; |
241 | |
242 | /** |
243 | * If in selser mode, while processing a node, do we know if |
244 | * its previous node has not been modified in an edit? |
245 | * @var bool |
246 | */ |
247 | public $prevNodeUnmodified; |
248 | |
249 | /** |
250 | * If in selser mode, while processing a node, do we know if |
251 | * it has not been modified in an edit? |
252 | * @var bool |
253 | */ |
254 | public $currNodeUnmodified; |
255 | |
256 | /** |
257 | * Should we run the wikitext escaping code on the wikitext chunk |
258 | * that will be emitted? |
259 | * @var bool |
260 | */ |
261 | public $needsEscaping = false; |
262 | |
263 | /** |
264 | * Used as fast patch for special protected characters in WikitextEscapeHandlers and |
265 | * comes from LanguageVariantHandler |
266 | * @var string|null |
267 | */ |
268 | public $protect; |
269 | |
270 | /** @var Separators */ |
271 | public $separators; |
272 | |
273 | /** @var Env */ |
274 | private $env; |
275 | |
276 | /** @var Element */ |
277 | public $currNode; |
278 | |
279 | /** @var Node|null Set by updateModificationFlags(); null before its first call */
280 | private $prevNode; |
281 | |
282 | /** @var array */ |
283 | public $openAnnotations; |
284 | |
285 | /** |
286 | * Log prefix to use in trace output |
287 | * @var string |
288 | */ |
289 | private $logPrefix = 'OUT:'; |
290 | |
291 | public $haveTrimmedWsDSR = false; |
292 | |
/**
 * Builds a fresh serializer state bound to $serializer and its Env.
 *
 * @param WikitextSerializer $serializer
 * @param array $options Optional overrides for the initial state:
 *   - onSOL: (bool)
 *   - inPHPBlock: (bool)
 *   - inAttribute: (bool)
 *   - protect: (string)
 *   - selserData: (SelserData)
 */
public function __construct( WikitextSerializer $serializer, array $options = [] ) {
	$env = $serializer->env;
	$this->env = $env;
	$this->serializer = $serializer;
	$this->extApi = new ParsoidExtensionAPI( $env, [ 'html2wt' => [ 'state' => $this ] ] );

	// Boolean flags keep their declared defaults unless the caller overrides them
	foreach ( [ 'onSOL', 'inPHPBlock', 'inAttribute' ] as $flag ) {
		if ( isset( $options[$flag] ) ) {
			$this->$flag = $options[$flag];
		}
	}
	$this->protect = $options['protect'] ?? null;
	$this->selserData = $options['selserData'] ?? null;

	// Start with an empty line buffer and empty separator state
	$this->resetCurrLine( null );
	$this->singleLineContext = new SingleLineContext();
	$this->resetSep();

	$inputVersion = $env->getInputContentVersion();
	$this->haveTrimmedWsDSR = Semver::satisfies( $inputVersion, '>=2.1.1' );
	$this->separators = new Separators( $env, $this );
}
317 | |
/**
 * Accessor for the Env this state was constructed with.
 *
 * @note Porting note: this replaces direct access
 * @return Env
 */
public function getEnv(): Env {
	return $this->env;
}
325 | |
/**
 * Sets the flags that depend on the serialization mode and on the version
 * of the input HTML.
 *
 * FIXME: Ideally, this should be private. That requires shuffling around
 * where SerializerState is constructed so that $selserMode is known
 * at the time of construction.
 * @private for use by WikitextSerializer only
 * @param bool $selserMode Are we running selective serialization?
 */
public function initMode( bool $selserMode ): void {
	$inputVersion = $this->env->getInputContentVersion();
	$this->useWhitespaceHeuristics = Semver::satisfies( $inputVersion, '>=1.7.0' );
	$this->selserMode = $selserMode;
}
339 | |
/**
 * Appends the separator source to the separator src buffer.
 * Don't update $state->onSOL since this string hasn't been emitted yet.
 * If content handlers change behavior based on whether this newline will
 * be emitted or not, they should peek into this buffer (ex: see TDHandler
 * and THHandler code).
 *
 * @param string $src
 */
public function appendSep( string $src ): void {
	// The buffer starts out as null (see resetSep()). Use `??` rather than
	// `?:` so that only null is replaced: `?:` would also discard an
	// already-buffered '0'.
	$this->sep->src = ( $this->sep->src ?? '' ) . $src;
}
352 | |
/**
 * Records $node as the last node for which separator state was updated.
 *
 * @param Node $node
 */
public function updateSep( Node $node ): void {
	$sepState = $this->sep;
	$sepState->lastSourceNode = $node;
}
360 | |
/**
 * Reset the separator state: no constraints, no collected source text,
 * and no last source node.
 */
private function resetSep(): void {
	$this->sep = (object)[
		'constraints' => null,
		'src' => null,
		'lastSourceNode' => null,
	];
}
368 | |
/**
 * Starts a new, empty current line whose first node is $node.
 *
 * @param ?Node $node
 */
private function resetCurrLine( ?Node $node ): void {
	$line = new stdClass;
	$line->text = '';
	$line->chunks = [];
	$line->firstNode = $node;
	$this->currLine = $line;
}
380 | |
/**
 * Runs the buffered ConstrainedText chunks of the current line through
 * ConstrainedText::escapeLine(), appends the result to the output, and
 * empties the chunk buffer.
 * (Start of line and end of line are always safe for ConstrainedText chunks, so we don't need
 * to buffer more than the last line.)
 */
private function flushLine(): void {
	$escapedLine = ConstrainedText::escapeLine( $this->currLine->chunks );
	$this->out .= $escapedLine;
	$this->currLine->chunks = [];
}
390 | |
/**
 * Returns the slice of the original page source between two offsets, or
 * null when the offsets do not describe a usable range.
 *
 * @param int $start Start offset, in bytes
 * @param int $end End offset, in bytes
 * @return string|null
 */
public function getOrigSrc( int $start, int $end ): ?string {
	Assert::invariant( $this->selserMode, 'SerializerState::$selserMode must be set' );

	if ( $start > $end ) {
		return null;
	}

	$oldText = $this->selserData->oldText;
	// FIXME: Having a $start greater than the source length is
	// probably a canary for corruption. Maybe we should be throwing
	// here instead. See T240053.
	// But, see comment in UnpackDOMFragments where we very very rarely
	// can deliberately set DSR to point outside page source.
	if ( $start > strlen( $oldText ) ) {
		return null;
	}

	return substr( $oldText, $start, $end - $start );
}
413 | |
/**
 * Cycles the modification flags after a node has been processed: the
 * "current node unmodified" flag becomes the "previous node unmodified"
 * flag, the current flag is cleared, and $node becomes the previous node.
 *
 * @param Node $node
 */
public function updateModificationFlags( Node $node ): void {
	$this->prevNode = $node;
	$this->prevNodeUnmodified = $this->currNodeUnmodified;
	$this->currNodeUnmodified = false;
}
423 | |
/**
 * Updates the start-of-line flag and the line buffer after a separator
 * has been emitted.
 *
 * @param string $sep
 * @param Node $node
 */
private function sepIntroducedSOL( string $sep, Node $node ): void {
	// Don't get tripped by newlines in comments! Be wary of nowikis added
	// by makeSepIndentPreSafe on the last line.
	$visibleSep = preg_replace( Utils::COMMENT_REGEXP, '', $sep );

	// Without a newline outside comments, nothing changes
	if ( !str_contains( $visibleSep, "\n" ) ) {
		return;
	}

	// Only a trailing newline puts us at the start of a line
	if ( str_ends_with( $visibleSep, "\n" ) ) {
		$this->onSOL = true;
	}

	// A line was completed: process escapes in our full line, then start a new one
	$this->flushLine();
	$this->resetCurrLine( $node );
}
443 | |
/**
 * Accumulates chunks on the current line.
 *
 * Appends the chunk's text to the line's running text, adds the chunk to
 * the line's chunk list, and traces the emitted text.
 *
 * @param ConstrainedText $chunk
 * @param string $logPrefix Prefix used in the trace output
 */
private function pushToCurrLine( ConstrainedText $chunk, string $logPrefix ): void {
	// Emitting text that has not been escaped
	$this->currLine->text .= $chunk->text;

	$this->currLine->chunks[] = $chunk;

	$this->serializer->trace( '--->', $logPrefix, static function () use ( $chunk ) {
		return PHPUtils::jsonEncode( $chunk->text );
	} );
}
459 | |
/**
 * Pushes the separator to the current line and resets the separator state.
 *
 * @param string $sep
 * @param Node $node
 * @param string $debugPrefix Prefix used in the trace output
 */
private function emitSep( string $sep, Node $node, string $debugPrefix ): void {
	$sep = ConstrainedText::cast( $sep, $node );

	// Replace newlines if we're in a single-line context.
	// A literal replacement needs no regexp; this matches emitChunk().
	if ( $this->singleLineContext->enforced() ) {
		$sep->text = str_replace( "\n", ' ', $sep->text );
	}

	$this->pushToCurrLine( $sep, $debugPrefix );
	$this->sepIntroducedSOL( $sep->text, $node );

	// Reset separator state
	$this->resetSep();
	$this->updateSep( $node );
}
481 | |
/**
 * Emits the separator for $node: the original separator text when it can be
 * reused, otherwise one built from the separator constraints.
 *
 * @param Node $node
 */
private function emitSepForNode( Node $node ): void {
	/* Deleting block nodes affects whether an unmodified newline separator
	 * between two unmodified P tags can be reused.
	 *
	 * Example:
	 * ```
	 * Original WT  : "<div>x</div>foo\nbar"
	 * Original HTML: "<div>x</div><p>foo</p>\n<p>bar</p>"
	 * Edited HTML  : "<p>foo</p>\n<p>bar</p>"
	 * Annotated DOM: "<mw:DiffMarker is-block><p>foo</p>\n<p>bar</p>"
	 * Expected WT  : "foo\n\nbar"
	 * ```
	 *
	 * Note the additional newline between "foo" and "bar", even though there
	 * was originally just a single newline.
	 *
	 * So it is not enough that the two P tags and the separator between them
	 * are unmodified. We also have to look at what happened on the two
	 * wikitext lines onto which the two P tags will be serialized.
	 *
	 * `nextToDeletedBlockNodeInWT` does not look at ALL the nodes that could
	 * serialize onto those wikitext lines, only at the immediately adjacent
	 * ones: it does not check whether a block tag was deleted 2 or 5 siblings
	 * away. Examining all of them would get very complex, and it would then be
	 * much simpler to just discard the original separators, which potentially
	 * means lots of dirty diffs.
	 *
	 * To see why looking at the immediately adjacent nodes is sufficient for
	 * correctness, consider another example:
	 * ```
	 * Original WT  : "a<div>b</div>c<div>d</div>e\nf"
	 * Original HTML: "<p>a</p><div>b</div><p>c</p><div>d</div><p>e</p>\n<p>f</p>"
	 * ```
	 * `<block>` tags and `<p>` tags interleave in the HTML, and they always
	 * will, no matter how much inline content shows up between the block tags
	 * in wikitext. If the b-`<div>` is deleted, we don't care, because the
	 * d-`<div>` before the P tag still preserves the correctness of the single
	 * `"\n"` separator. If the d-`<div>` is deleted, we conservatively ignore
	 * the original separator and let the normal P-P constraints take care of
	 * it. At worst, we generate a dirty diff in this scenario. */
	$prevNode = $this->prevNode;
	$origSepUsable = false;

	// No original separator is needed when $node was the last separator source node
	if ( $node !== $this->sep->lastSourceNode ) {
		// Either $node is the first content node of <body> ($prevNode) ...
		$prevContextOk = DOMUtils::isBody( $prevNode ) && $node->parentNode === $prevNode;
		if ( !$prevContextOk ) {
			// ... or $node is a sibling of an unmodified $prevNode
			$prevContextOk = $prevNode && $this->prevNodeUnmodified &&
				$node->parentNode === $prevNode->parentNode &&
				!WTSUtils::nextToDeletedBlockNodeInWT( $prevNode, true );
		}
		if ( $prevContextOk ) {
			$origSepUsable = $this->currNodeUnmodified &&
				!WTSUtils::nextToDeletedBlockNodeInWT( $node, false );
		}
	}

	$origSep = null;
	if ( $origSepUsable ) {
		if ( $prevNode instanceof Element && $node instanceof Element ) {
			'@phan-var Element $node';/** @var Element $node */
			// <body> won't have DSR in body_only scenarios
			$sepStart = DOMUtils::isBody( $prevNode ) ?
				0 : DOMDataUtils::getDataParsoid( $prevNode )->dsr->end;
			$sepEnd = DOMDataUtils::getDataParsoid( $node )->dsr->start;
			$origSep = $this->getOrigSrc( $sepStart, $sepEnd );
		} elseif ( $this->sep->src && WTSUtils::isValidSep( $this->sep->src ) ) {
			// We don't know where '$this->sep->src' comes from. So, reuse it
			// only if it is a valid separator string.
			$origSep = $this->sep->src;
		}
	}

	if ( $origSep === null ) {
		// No reusable original separator: build one from the constraints
		$builtSep = $this->separators->buildSep( $node );
		$this->emitSep( $builtSep ?: '', $node, 'SEP:' );
		return;
	}

	$this->emitSep( $origSep, $node, 'ORIG-SEP:' );
}
572 | |
/**
 * Asks the Separators helper for the trimmed whitespace of $node, traces it,
 * and returns it to the caller.
 *
 * @param Node $node
 * @param bool $leading
 *   if true, trimmed leading whitespace is recovered
 *   if false, trimmed trailing whitespace is recovered
 * @return string|null
 */
public function recoverTrimmedWhitespace( Node $node, bool $leading ): ?string {
	$trimmedWs = $this->separators->recoverTrimmedWhitespace( $node, $leading );

	$this->serializer->trace( '--->', "TRIMMED-SEP:", static function () use ( $trimmedWs ) {
		return PHPUtils::jsonEncode( $trimmedWs );
	} );

	return $trimmedWs;
}
588 | |
/**
 * Pushes the chunk to the current line.
 *
 * In order: emits the pending separator (unless the chunk opts out),
 * wikitext-escapes the chunk if required, appends it to the current line,
 * and updates the start-of-line / start-of-output flags.
 *
 * @param ConstrainedText|string $res
 * @param Node $node
 */
public function emitChunk( $res, Node $node ): void {
	$chunk = ConstrainedText::cast( $res, $node );

	// Replace newlines if we're in a single-line context
	if ( $this->singleLineContext->enforced() ) {
		$chunk->text = str_replace( "\n", ' ', $chunk->text );
	}

	// Emit separator first
	if ( !$chunk->noSep ) {
		$this->emitSepForNode( $node );
	} elseif ( $this->onSOL ) {
		/* Separators are skipped for internal tokens from SelSer, */
		// but we still process escapes in our full line
		$this->flushLine();
		$this->resetCurrLine( $node );
	}

	$escape = $this->needsEscaping;
	if ( $escape && $this->currNode instanceof Text ) {
		$escape = !$this->inHTMLPre && ( $this->onSOL || !$this->currNodeUnmodified );
	}

	if ( $escape ) {
		// Escape the chunk's text, keeping its constraints and node
		$chunk = new ConstrainedText( [
			'text' => $this->serializer->escapeWikitext( $this, $chunk->text, [
				'node' => $node,
				'isLastChild' => DiffDOMUtils::nextNonDeletedSibling( $node ) === null,
			] ),
			'prefix' => $chunk->prefix,
			'suffix' => $chunk->suffix,
			'node' => $chunk->node,
		] );
		$this->needsEscaping = false;
	} else {
		$chunk = $this->nowikiSolCharInUnmodifiedParagraph( $chunk, $node );
	}

	// Output the chunk
	$this->pushToCurrLine( $chunk, $this->logPrefix );

	// Update sol flag. Test for newlines followed by optional includeonly or comments
	if ( !$chunk->matches( $this->solRegexp(), $this->env ) ) {
		$this->onSOL = false;
	}

	// We've emitted something so we're no longer at SOO.
	$this->atStartOfOutput = false;
}

/**
 * Nowiki-escapes the leading sol-sensitive character of an unmodified
 * paragraph's output when that output now lands at the start of a line.
 *
 * If the chunk is coming from selser and the current node is a paragraph tag,
 * it might need some leading chars nowiki-escaped before being output.
 * Because of block-tag p-wrapping behavior, sol-sensitive characters that used
 * to be in non-sol positions, but yet wrapped in p-tags, could end up in
 * sol-position if those block tags get deleted during edits.
 *
 * Ex: a<div>foo</div>*b
 * -- wt2html --> <p>a</p><div>foo</div><p>*b</p>
 * -- EDIT --> <p>a</p><p>*b</p>
 * -- html2wt --> a\n\n<nowiki>*</nowiki>b
 *
 * In this scenario, <p>a</p> and <p>*b</p> will be marked unmodified and
 * are processed here.
 *
 * @param ConstrainedText $chunk
 * @param Node $node
 * @return ConstrainedText $chunk itself, or a replacement with the escape applied
 */
private function nowikiSolCharInUnmodifiedParagraph(
	ConstrainedText $chunk, Node $node
): ConstrainedText {
	// 'node' came from original Parsoid HTML unmodified. So, if its content
	// needs nowiki-escaping, we know that the reason it didn't parse into
	// lists/headings/whatever is because it didn't occur at the start of the
	// line => it had a block-tag in the original wikitext. So if the previous
	// node was also unmodified (and since it also came from original Parsoid
	// HTML), we can safely infer that it couldn't have been an inline node or
	// a P-tag (if it were, the p-wrapping code would have swallowed that content
	// into 'node'). So, it would have to be some sort of block tag => this.onSOL
	// couldn't have been true (because we could have serialized 'node' on the
	// same line as the block tag) => we can save some effort by eliminating
	// scenarios where 'this.prevNodeUnmodified' is true.
	if ( !$this->selserMode
		|| !$this->onSOL
		|| !$this->currNodeUnmodified
		|| $this->prevNodeUnmodified
	) {
		return $chunk;
	}

	if ( DOMCompat::nodeName( $node ) !== 'p' || WTUtils::isLiteralHTMLNode( $node ) ) {
		return $chunk;
	}

	// If a text node, we have to make sure that the text doesn't
	// get reparsed as non-text in the wt2html pipeline.
	if ( !( DiffDOMUtils::firstNonSepChild( $node ) instanceof Text ) ) {
		return $chunk;
	}

	$match = $chunk->matches( $this->solWikitextRegexp(), $this->env );
	if ( !$match || !isset( $match[2] ) ) {
		return $chunk;
	}

	$rest = $match[2];
	$needsNowiki = preg_match( '/^([\*#:;]|{\||.*=$)/D', $rest )
		// ! and | chars are harmless outside tables
		|| ( strspn( $rest, '|!' ) && $this->wikiTableNesting > 0 )
		// indent-pres are suppressed inside <blockquote>
		|| ( preg_match( '/^ \S/', $rest )
			&& !DOMUtils::hasNameOrHasAncestorOfName( $node, 'blockquote' ) );
	if ( !$needsNowiki ) {
		return $chunk;
	}

	// Wrap only the first character of the sensitive text in a nowiki
	return ConstrainedText::cast( ( $match[1] ?: '' )
		. '<nowiki>' . substr( $rest, 0, 1 ) . '</nowiki>'
		. substr( $rest, 1 ), $node );
}
695 | |
/**
 * Serializes the children of a DOM node with this (shared) serializer state.
 * DOM-based handlers typically call this to continue with their children.
 *
 * @param Element|DocumentFragment $node
 * @param ?callable $wtEscaper ( $state, $text, $opts )
 *   PORT-FIXME document better; should this be done via WikitextEscapeHandlers somehow?
 * @param ?Node $firstChild Child to start from; defaults to the node's first child
 */
public function serializeChildren(
	Node $node, ?callable $wtEscaper = null, ?Node $firstChild = null
): void {
	// SSS FIXME: Unsure if this is the right thing always
	$hasEscaper = (bool)$wtEscaper;
	if ( $hasEscaper ) {
		$this->wteHandlerStack[] = $wtEscaper;
	}

	// serializeNode() always hands back the next child to process
	for ( $child = $firstChild ?: $node->firstChild; $child !== null; ) {
		$child = $this->serializer->serializeNode( $child );
	}

	if ( $hasEscaper ) {
		array_pop( $this->wteHandlerStack );
	}

	// If we serialized children explicitly,
	// we were obviously processing a modified node.
	$this->currNodeUnmodified = false;
}
726 | |
/**
 * Runs a complete serialization pass over the children of $node.
 * Shared by `serializeChildrenToString` and `serializeDOM`.
 *
 * @param Element|DocumentFragment $node
 * @param ?callable $wtEscaper See {@link serializeChildren()}
 * @internal For use by WikitextSerializer only
 */
public function kickOffSerialize(
	Node $node, ?callable $wtEscaper = null
): void {
	// Prime the separator state, modification flags and line buffer with $node
	$this->updateSep( $node );
	$this->currNodeUnmodified = false;
	$this->updateModificationFlags( $node );
	$this->resetCurrLine( $node->firstChild );

	$this->serializeChildren( $node, $wtEscaper );

	// Emit child-parent seps.
	$this->emitSepForNode( $node );

	// We've reached EOF, flush the remaining buffered text.
	$this->flushLine();
}
747 | |
/**
 * Serialize children to a string
 *
 * Output, separator, line and modification state are saved, replaced with
 * fresh values for the nested serialization, and restored afterwards. The
 * context flag named by $inState is set for the duration of the call and
 * then restored to its previous value, so an enclosing context of the same
 * kind is not clobbered.
 *
 * FIXME(arlorla): Shouldn't affect the separator state, but accidents
 * have been known to happen. T109793 suggests using its own wts / state.
 *
 * @param Element|DocumentFragment $node
 * @param ?callable $wtEscaper See {@link serializeChildren()}
 * @param string $inState Name of the context flag to set, e.g. 'inLink'
 * @return string
 */
private function serializeChildrenToString(
	Node $node, ?callable $wtEscaper, string $inState
): string {
	$states = [ 'inLink', 'inCaption', 'inIndentPre', 'inHTMLPre', 'inPHPBlock', 'inAttribute' ];
	Assert::parameter( in_array( $inState, $states, true ), '$inState', 'Must be one of: '
		. implode( ', ', $states ) );
	// FIXME: Make sure that the separators emitted here conform to the
	// syntactic constraints of syntactic context.
	$oldSep = $this->sep;
	$oldSOL = $this->onSOL;
	$oldOut = $this->out;
	$oldStart = $this->atStartOfOutput;
	$oldCurrLine = $this->currLine;
	$oldLogPrefix = $this->logPrefix;
	$oldInState = $this->$inState;
	// Modification flags
	$oldPrevNodeUnmodified = $this->prevNodeUnmodified;
	$oldCurrNodeUnmodified = $this->currNodeUnmodified;
	$oldPrevNode = $this->prevNode;

	$this->out = '';
	$this->logPrefix = 'OUT(C):';
	$this->resetSep();
	$this->onSOL = false;
	$this->atStartOfOutput = false;
	$this->$inState = true;

	$this->singleLineContext->disable();
	$this->kickOffSerialize( $node, $wtEscaper );
	$this->singleLineContext->pop();

	// restore the state
	$bits = $this->out;
	$this->out = $oldOut;
	// Restore, rather than clear, the flag: we may have been called from
	// within an enclosing context of the same kind.
	$this->$inState = $oldInState;
	$this->sep = $oldSep;
	$this->onSOL = $oldSOL;
	$this->atStartOfOutput = $oldStart;
	$this->currLine = $oldCurrLine;
	$this->logPrefix = $oldLogPrefix;
	// Modification flags
	$this->prevNodeUnmodified = $oldPrevNodeUnmodified;
	$this->currNodeUnmodified = $oldCurrNodeUnmodified;
	$this->prevNode = $oldPrevNode;
	return $bits;
}
804 | |
/**
 * Serializes the children of $node to a string with the `inLink` flag set.
 *
 * @param Element|DocumentFragment $node
 * @param ?callable $wtEscaper See {@link serializeChildren()}
 * @return string
 */
public function serializeLinkChildrenToString(
	Node $node, ?callable $wtEscaper = null
): string {
	$wikitext = $this->serializeChildrenToString( $node, $wtEscaper, 'inLink' );
	return $wikitext;
}
816 | |
/**
 * Serializes the children of $node to a string with the `inCaption` flag set.
 *
 * @param Element|DocumentFragment $node
 * @param ?callable $wtEscaper See {@link serializeChildren()}
 * @return string
 */
public function serializeCaptionChildrenToString(
	Node $node, ?callable $wtEscaper = null
): string {
	$wikitext = $this->serializeChildrenToString( $node, $wtEscaper, 'inCaption' );
	return $wikitext;
}
828 | |
/**
 * Serializes the children of $node to a string with the `inIndentPre` flag set.
 *
 * @param Element|DocumentFragment $node
 * @param ?callable $wtEscaper See {@link serializeChildren()}
 * @return string
 */
public function serializeIndentPreChildrenToString(
	Node $node, ?callable $wtEscaper = null
): string {
	$wikitext = $this->serializeChildrenToString( $node, $wtEscaper, 'inIndentPre' );
	return $wikitext;
}
840 | |
/**
 * Records $ann as an open annotation range, together with whether that
 * range has been extended.
 *
 * @param string $ann Key identifying the annotation range
 * @param bool $extended
 */
public function openAnnotationRange( string $ann, bool $extended ) {
	// Keyed by annotation, so re-opening the same key overwrites its flag
	$this->openAnnotations[$ann] = $extended;
}
849 | |
/**
 * Drops $ann from the set of open annotation ranges.
 * Does nothing if no such range is recorded.
 *
 * @param string $ann Key identifying the annotation range
 */
public function closeAnnotationRange( string $ann ) {
	unset( $this->openAnnotations[$ann] );
}
857 | |
858 | } |