Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 581 |
|
0.00% |
0 / 22 |
CRAP | |
0.00% |
0 / 1 |
WikitextEscapeHandlers | |
0.00% |
0 / 581 |
|
0.00% |
0 / 22 |
61256 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
startsOnANewLine | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
hasBlocksOnLine | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
110 | |||
hasLeadingEscapableQuoteChar | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
20 | |||
hasTrailingEscapableQuoteChar | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
20 | |||
escapedIBSiblingNodeText | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
30 | |||
isFirstContentNode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
liHandler | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
42 | |||
thHandler | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
mediaOptionHandler | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
wikilinkHandler | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
aHandler | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
tdHandler | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
182 | |||
tokenizeStr | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
textCanParseAsLink | |
0.00% |
0 / 52 |
|
0.00% |
0 / 1 |
552 | |||
hasWikitextTokens | |
0.00% |
0 / 68 |
|
0.00% |
0 / 1 |
1640 | |||
nowikiWrap | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 | |||
escapedText | |
0.00% |
0 / 78 |
|
0.00% |
0 / 1 |
600 | |||
escapeWikitext | |
0.00% |
0 / 111 |
|
0.00% |
0 / 1 |
2652 | |||
appendStr | |
0.00% |
0 / 37 |
|
0.00% |
0 / 1 |
552 | |||
escapeTplArgWT | |
0.00% |
0 / 127 |
|
0.00% |
0 / 1 |
812 | |||
escapeLinkContent | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Html2Wt; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Parsoid\Config\Env; |
8 | use Wikimedia\Parsoid\DOM\Element; |
9 | use Wikimedia\Parsoid\DOM\Node; |
10 | use Wikimedia\Parsoid\Tokens\EndTagTk; |
11 | use Wikimedia\Parsoid\Tokens\EOFTk; |
12 | use Wikimedia\Parsoid\Tokens\SourceRange; |
13 | use Wikimedia\Parsoid\Tokens\TagTk; |
14 | use Wikimedia\Parsoid\Tokens\Token; |
15 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
16 | use Wikimedia\Parsoid\Utils\DOMCompat; |
17 | use Wikimedia\Parsoid\Utils\DOMUtils; |
18 | use Wikimedia\Parsoid\Utils\PHPUtils; |
19 | use Wikimedia\Parsoid\Utils\TokenUtils; |
20 | use Wikimedia\Parsoid\Utils\Utils; |
21 | use Wikimedia\Parsoid\Utils\WTUtils; |
22 | use Wikimedia\Parsoid\Wikitext\Consts; |
23 | use Wikimedia\Parsoid\Wt2Html\PegTokenizer; |
24 | |
25 | class WikitextEscapeHandlers { |
26 | |
/**
 * Pattern used by the link-related handlers below: matches "[[", "]]",
 * "-{", or a string whose only bracket is a single "]" at its very end.
 */
private const LINKS_ESCAPE_RE = '/(\[\[)|(\]\])|(-\{)|(^[^\[]*\]$)/D';

/** @var Env */
private $env;

/** @var ?string */
private $extName;

/** @var PegTokenizer */
private $tokenizer;

/**
 * @param Env $env Stored on the instance and also handed to the
 *   PegTokenizer created here
 * @param ?string $extName Stored as-is; compared against extension tag
 *   names in hasWikitextTokens()
 */
public function __construct( Env $env, ?string $extName ) {
	$this->tokenizer = new PegTokenizer( $env );
	$this->extName = $extName;
	$this->env = $env;
}
45 | |
/**
 * Identify nodes for which the serializer itself introduces a newline that
 * is not present in the DOM: the tag opens a block scope and the node is
 * not a literal HTML node.
 *
 * @param Node $node
 * @return bool
 */
private static function startsOnANewLine( Node $node ): bool {
	$opensBlockScope = TokenUtils::tagOpensBlockScope( DOMCompat::nodeName( $node ) );
	if ( !$opensBlockScope ) {
		return false;
	}
	return !WTUtils::isLiteralHTMLNode( $node );
}
56 | |
/**
 * Walk forward from $node (following siblings, descending into element
 * children) to see whether a wikitext block node is reached before any
 * newline-bearing text.
 *
 * @param Node $node Node to start scanning from
 * @param bool $first True for the initial call: $node's own text is only
 *   inspected for newlines (ignoring one at the very first character)
 *   and the sibling walk then starts at its next sibling
 * @return bool
 */
private static function hasBlocksOnLine( Node $node, bool $first ): bool {
	$cursor = $node;

	if ( $first ) {
		// We are at start-of-line here, so a "\n" in the first character
		// position is ignored: begin the search one byte in whenever the
		// node has any text at all.
		$content = $node->textContent;
		$searchFrom = strlen( $content ) > 0 ? 1 : 0;
		if ( strpos( $content, "\n", $searchFrom ) !== false ) {
			return false;
		}
		$cursor = $node->nextSibling;
	}

	for ( ; $cursor; $cursor = $cursor->nextSibling ) {
		if ( !( $cursor instanceof Element ) ) {
			// Non-element node: a newline ends the line being examined.
			if ( str_contains( $cursor->textContent, "\n" ) ) {
				return false;
			}
			continue;
		}

		if ( DOMUtils::isWikitextBlockNode( $cursor ) ) {
			// A block only counts as being on this line when the
			// serializer would not push it onto a new line.
			return !self::startsOnANewLine( $cursor );
		}

		if ( $cursor->hasChildNodes() &&
			self::hasBlocksOnLine( $cursor->firstChild, false )
		) {
			return true;
		}
	}

	return false;
}
95 | |
/**
 * Does the node's text begin with a single quote that directly follows a
 * quote element (previous non-deleted sibling, or the parent when there is
 * no such sibling)?
 *
 * @param string $text Not consulted; the test runs on the node's textContent
 * @param array $opts [ 'node' => Node ]
 * @return bool
 */
private static function hasLeadingEscapableQuoteChar( string $text, array $opts ): bool {
	/** @var Node $node */
	$node = $opts['node'];

	// The node's own textContent is used rather than $text because it has
	// not had newlines stripped out of it.
	// Ex: For this DOM: <i>x</i>\n'\n<i>y</i>
	//     node.textContent = \n'\n and text = '
	// A leading newline therefore spares us an unnecessary <nowiki/>.
	$rawText = $node->textContent;
	if ( substr( $rawText, 0, 1 ) !== "'" ) {
		return false;
	}

	$neighbour = DiffDOMUtils::previousNonDeletedSibling( $node ) ?: $node->parentNode;
	return (bool)DOMUtils::isQuoteElt( $neighbour );
}
123 | |
/**
 * Does the node's text end with a single quote that is directly followed
 * by a quote element (next non-deleted sibling, or the parent when there
 * is no such sibling)?
 *
 * @param string $text Not consulted; the test runs on the node's textContent
 * @param array $opts [ 'node' => Node ]
 * @return bool
 */
private static function hasTrailingEscapableQuoteChar( string $text, array $opts ): bool {
	$node = $opts['node'];

	// The node's own textContent is used rather than $text because it has
	// not had newlines stripped out of it.
	// Ex: For this DOM: <i>x</i>\n'\n<i>y</i>
	//     node.textContent = \n'\n and text = '
	// A trailing newline therefore spares us an unnecessary <nowiki/>.
	$rawText = $node->textContent;
	if ( substr( $rawText, -1 ) !== "'" ) {
		return false;
	}

	$neighbour = DiffDOMUtils::nextNonDeletedSibling( $node ) ?: $node->parentNode;
	return (bool)DOMUtils::isQuoteElt( $neighbour );
}
150 | |
/**
 * Quote-escape a text node that is a sibling of i/b tags.
 *
 * SSS FIXME: By doing a DOM walkahead to identify what else is on the
 * current line, these heuristics can be improved. Ex: '<i>foo</i> blah blah
 * does not require a <nowiki/> after the single quote since we know that
 * there are no other quotes on the rest of the line that will be emitted.
 * Similarly, '' does not need a <nowiki> wrapper since there are no other
 * quote chars on the line.
 *
 * @param SerializerState $state hasQuoteNowikis is set on it whenever a
 *   <nowiki/> marker is added
 * @param string $text
 * @param array $opts [ 'node' => Node ]
 * @return string The escaped text, or '' when no quote escaping applied
 */
private static function escapedIBSiblingNodeText(
	SerializerState $state, string $text, array $opts
): string {
	// A run of 2+ quote chars has to be fully wrapped in
	// <nowiki>...</nowiki>; a <nowiki/> at the start and end doesn't work.
	//
	// Ex: ''<i>foo</i> should serialize to <nowiki>''</nowiki>''foo''.
	//
	// Serializing it to ''<nowiki/>''foo'' breaks html2html semantics
	// since it will parse back to <i><meta../></i>foo<i></i>
	if ( str_contains( $text, "''" ) ) {
		// Keep the wrapped span minimal: from the first quote char
		// through the last one, inclusive.
		$firstQuote = strpos( $text, "'" );
		$lastQuote = strrpos( $text, "'" );
		$before = substr( $text, 0, $firstQuote );
		$quoted = substr( $text, $firstQuote, $lastQuote - $firstQuote + 1 );
		$after = substr( $text, $lastQuote + 1 );
		return $before . '<nowiki>' . $quoted . '</nowiki>' . $after;
	}

	// Otherwise only the head and/or tail may need <nowiki/> protection.
	$protectTail = self::hasTrailingEscapableQuoteChar( $text, $opts );
	$protectHead = self::hasLeadingEscapableQuoteChar( $text, $opts );
	if ( !$protectTail && !$protectHead ) {
		return '';
	}

	$state->hasQuoteNowikis = true;
	return ( $protectHead ? '<nowiki/>' : '' ) .
		$text .
		( $protectTail ? '<nowiki/>' : '' );
}
198 | |
/**
 * True when $node has no previous sibling once deleted-node markers are
 * skipped.
 *
 * @param Node $node
 * @return bool
 */
public function isFirstContentNode( Node $node ): bool {
	$previous = DiffDOMUtils::previousNonDeletedSibling( $node );
	return $previous === null;
}
203 | |
/**
 * Escape handler for text that is a direct child of a list item node.
 *
 * @param Node $liNode The list item node the handler was set up for
 * @param SerializerState $state
 * @param string $text
 * @param array $opts [ 'node' => Node ]
 * @return bool True when $text needs escaping
 */
public function liHandler(
	Node $liNode, SerializerState $state, string $text, array $opts
): bool {
	/** @var Node $node */
	$node = $opts['node'];

	// Only direct children of the list item are of interest.
	if ( $node->parentNode !== $liNode ) {
		return false;
	}

	// For <dt> nodes, ":" triggers nowiki outside of elements.
	if ( DOMCompat::nodeName( $liNode ) === 'dt' && str_contains( $text, ':' ) ) {
		return true;
	}

	// For first nodes of <li>'s, bullets in sol posn trigger escaping.
	// Wikitext styling might require whitespace insertion after list
	// bullets; in those scenarios bullet-wikitext in the text node is okay,
	// hence the requirement that the line so far holds nothing but bullets.
	$lineIsOnlyBullets = preg_match( '/^[#*:;]*$/D', $state->currLine->text );
	if ( !$lineIsOnlyBullets || !$this->isFirstContentNode( $node ) ) {
		return false;
	}

	return strspn( $text, '#*:;', 0, 1 ) > 0;
}
235 | |
/**
 * Escape handler for text inside a table heading cell.
 *
 * @param Node $thNode
 * @param SerializerState $state
 * @param string $text
 * @param array $opts [ 'node' => Node ]
 * @return bool True when $text needs escaping
 */
public function thHandler(
	Node $thNode, SerializerState $state, string $text, array $opts
): bool {
	// {|
	// !a<div>!!b</div>
	// !c<div>||d</div>
	// |}
	//
	// The <div> will get split across two <th> tags because
	// the !! and | has higher precedence in the tokenizer.
	//
	// So wherever in the <th> subtree the text shows up, the !! and |
	// characters are escaped unconditionally -- provided the current
	// line is one that started with the heading marker.
	if ( !preg_match( '/^\s*!/', $state->currLine->text ) ) {
		return false;
	}
	return (bool)preg_match( '/^[^\n]*!!|\|/', $text );
}
263 | |
/**
 * Escape handler for media option text: a pipe, or anything matched by
 * LINKS_ESCAPE_RE, requires escaping.
 *
 * @param SerializerState $state
 * @param string $text
 * @return bool
 */
public function mediaOptionHandler( SerializerState $state, string $text ): bool {
	if ( str_contains( $text, '|' ) ) {
		return true;
	}
	return (bool)preg_match( self::LINKS_ESCAPE_RE, $text );
}
267 | |
/**
 * Escape handler for wikilink content: anything matched by
 * LINKS_ESCAPE_RE requires escaping.
 *
 * @param SerializerState $state
 * @param string $text
 * @return bool
 */
public function wikilinkHandler( SerializerState $state, string $text ): bool {
	$matched = preg_match( self::LINKS_ESCAPE_RE, $text );
	return $matched === 1;
}
271 | |
/**
 * Escape handler for link text: a closing square bracket anywhere in
 * $text requires escaping.
 *
 * @param SerializerState $state
 * @param string $text
 * @return bool
 */
public function aHandler( SerializerState $state, string $text ): bool {
	$hasClosingBracket = str_contains( $text, ']' );
	return $hasClosingBracket;
}
275 | |
/**
 * Escape handler for text inside a table cell.
 *
 * "|" anywhere in a text node of the <td> subtree can be trouble, so it
 * is not sufficient to look only at the immediate children of the <td>.
 * Try parsing the following table:
 *
 *   {|
 *   |a''b|c''
 *   |}
 *
 * Similarly, "-", "+" or "}" emitted right after a "|" in sol position is
 * trouble. Besides being the immediate first child of $tdNode, such text
 * can sit on the leftmost path below $tdNode as long as every node on
 * that path emits no wikitext of its own.
 * Ex: <td><p>-</p></td>, but not: <td><small>-</small></td>
 *
 * @param Node $tdNode
 * @param bool $inWideTD
 * @param SerializerState $state
 * @param string $text
 * @param array $opts [ 'node' => ?Node ]
 * @return bool True when $text needs escaping
 */
public function tdHandler(
	Node $tdNode, bool $inWideTD, SerializerState $state, string $text, array $opts
): bool {
	$node = $opts['node'] ?? null;

	// With a node given, nothing needs doing unless the current line's
	// first node is the <td> itself, i.e. $text lands on the same
	// wikitext line as the cell's "|".
	if ( $node && $state->currLine->firstNode !== $tdNode ) {
		return false;
	}

	// A pipe in a td is always escaped.
	if ( str_contains( $text, '|' ) ) {
		return true;
	}

	// The remaining hazard is one of -+} right after the cell's opening
	// "|" at start of line.
	if ( $inWideTD ||
		$state->currLine->text !== '|' ||
		!strspn( $text, '-+}', 0, 1 ) ||
		!$node
	) {
		return false;
	}

	// Climb from the text node up to the <td>: every node on the way must
	// be a first content node, and every ancestor in between must be a
	// zero-width wikitext element.
	for ( $current = $node; $current && $current !== $tdNode; $current = $current->parentNode ) {
		if ( !$this->isFirstContentNode( $current ) ) {
			return false;
		}
		if ( $current !== $node && !WTUtils::isZeroWidthWikitextElt( $current ) ) {
			return false;
		}
	}

	return true;
}
331 | |
/**
 * Run the tokenizer over a string and return the tokens with the final
 * EOF token removed.
 *
 * @param string $str
 * @param bool $sol Passed to the tokenizer as its 'sol' option
 * @return array
 */
public function tokenizeStr( string $str, bool $sol ): array {
	$tokens = $this->tokenizer->tokenizeSync( $str, [ 'sol' => $sol ] );
	$terminator = array_pop( $tokens );
	Assert::invariant( $terminator instanceof EOFTk, 'Expected EOF token!' );
	return $tokens;
}
347 | |
/**
 * Decide whether $text, appended to what has already been emitted on the
 * current line, would be tokenized as (part of) a link and therefore
 * needs nowiki protection.
 *
 * @param Node $node Starting point for the backwards sibling walk done
 *   when an extlink's href is a template token
 * @param SerializerState $state Supplies the env and the current line text
 * @param string $text
 * @return bool True when nowiki-ing is needed, or when skipping it could
 *   not be proven safe
 */
public function textCanParseAsLink( Node $node, SerializerState $state, string $text ): bool {
	$env = $state->getEnv();
	$env->log(
		'trace/wt-escape', 'link-test-text=',
		static function () use ( $text ) {
			return PHPUtils::jsonEncode( $text );
		}
	);

	// Strip away extraneous characters after a ]] or a ]
	// They are inessential to the test of whether the ]]/]
	// will get parsed into a wikilink and only complicate
	// the logic (needing to ignore entities, etc.).
	// preg_replace() yields null on a PCRE error; keep the unmodified text
	// in that case rather than handing null to str_contains() below.
	$text = preg_replace( '/\][^\]]*$/D', ']', $text, 1 ) ?? $text;

	// text only contains ']' chars.
	// Since we stripped everything after ']' above, if a newline is
	// present, a link would have to straddle newlines which is not valid.
	if ( str_contains( $text, "\n" ) ) {
		return false;
	}

	$str = $state->currLine->text . $text;
	$tokens = $this->tokenizeStr( $str, false ); // sol state is irrelevant here
	$n = count( $tokens );
	// tokenizeStr() has already popped the EOF token, so the list can be
	// empty; avoid reading the non-existent key -1 in that case. A null
	// last token falls through to the conservative "return true" at the end.
	$lastToken = $tokens[$n - 1] ?? null;

	$env->log( 'trace/wt-escape', 'str=', $str, ';tokens=', $tokens );

	// If 'text' remained outside of any non-string tokens,
	// it does not need nowiking.
	if ( $lastToken === $text ||
		( is_string( $lastToken ) &&
			$text === substr( $lastToken, -strlen( $text ) )
		)
	) {
		return false;
	}

	// Verify that the tokenized links are valid links.
	// Tokens are walked back to front, rebuilding their source in $buf.
	$buf = '';
	for ( $i = $n - 1; $i >= 0; $i-- ) {
		$t = $tokens[$i];
		if ( is_string( $t ) ) {
			$buf = $t . $buf;
		} elseif ( $t->getName() === 'wikilink' ) {
			$target = $t->getAttributeV( 'href' );
			if ( is_array( $target ) ) {
				// FIXME: in theory template expansion *could* make this a link.
				return false;
			}
			if ( $env->isValidLinkTarget( $target ) &&
				!$env->getSiteConfig()->hasValidProtocol( $target )
			) {
				return true;
			}

			// Assumes 'src' will always be present which it seems to be.
			// Tests will fail if anything changes in the tokenizer.
			$buf = $t->dataParsoid->src . $buf;
		} elseif ( $t->getName() === 'extlink' ) {
			// Check if the extlink came from a template which in the end
			// would not really parse as an extlink.

			$href = $t->getAttributeV( 'href' );
			if ( is_array( $href ) ) {
				$href = $href[0];
			}

			if ( !TokenUtils::isTemplateToken( $href ) ) {
				// Not a template and a real href => needs nowiking
				if ( is_string( $href ) && preg_match( '#https?://#', $href ) ) {
					return true;
				}
			} else {
				// Step back over previous non-separator siblings until a
				// first encapsulation wrapper node is found (or the
				// siblings run out, leaving $node null).
				while ( $node ) {
					$node = DiffDOMUtils::previousNonSepSibling( $node );
					if ( $node && WTUtils::isFirstEncapsulationWrapperNode( $node ) ) {
						// FIXME: This is not entirely correct.
						// Assumes that extlink content doesn't have templates.
						// Solution: Count # of non-nested templates encountered
						// and skip over intermediate templates.
						// var content = t.getAttribute('mw:content');
						// var n = intermediateNonNestedTemplates(content);
						break;
					}
				}

				if ( $node instanceof Element && DOMCompat::nodeName( $node ) === 'a' &&
					$node->textContent === DOMCompat::getAttribute( $node, 'href' )
				) {
					// The template expands to an url link => needs nowiking
					return true;
				}
			}

			// Since this will not parse to a real extlink,
			// update buf with the wikitext src for this token.
			$tsr = $t->dataParsoid->tsr;
			$buf = $tsr->substr( $str ) . $buf;
		} else {
			// We have no other smarts => be conservative.
			return true;
		}

		if ( $text === substr( $buf, -strlen( $text ) ) ) {
			// 'text' emerged unscathed
			return false;
		}
	}

	// We couldn't prove safety of skipping nowiki-ing.
	return true;
}
462 | |
/**
 * Tokenize $text and report whether the resulting token stream contains
 * anything that forces the text to be escaped.
 *
 * Side effect: sets $state->hasHeadingEscapes when a heading start tag
 * (h + digit) is the token that triggers the escape.
 *
 * @param SerializerState $state
 * @param bool $onNewline
 * @param string $text
 * @return bool
 */
private function hasWikitextTokens(
	SerializerState $state, bool $onNewline, string $text
): bool {
	$env = $state->getEnv();
	$env->log(
		'trace/wt-escape', 'nl:', $onNewline, ':text=',
		static function () use ( $text ) {
			return PHPUtils::jsonEncode( $text );
		}
	);

	// Start-of-line tokenization is never used inside an indent-pre or a
	// PHP block.
	$sol = $onNewline && !( $state->inIndentPre || $state->inPHPBlock );

	// If we're inside a <pre>, we need to add an extra space after every
	// newline so that the tokenizer correctly parses all tokens in a pre
	// instead of just the first one. See T95794.
	if ( $state->inIndentPre ) {
		$text = str_replace( "\n", "\n ", $text );
	}

	// Number of mw:Entity span start tags whose end tag is still pending.
	$openEntitySpans = 0;

	// Any TagTk, SelfclosingTagTk or EndTagTk that is not explicitly
	// ignored below means the text needs escaping.
	foreach ( $this->tokenizeStr( $text, $sol ) as $tok ) {
		$env->log(
			'trace/wt-escape', 'T:',
			static function () use ( $tok ) {
				return PHPUtils::jsonEncode( $tok );
			}
		);

		$type = TokenUtils::getTokenType( $tok );

		// Ignore html tags that aren't allowed as literals in wikitext
		if ( TokenUtils::isHTMLTag( $tok ) ) {
			if ( TokenUtils::matchTypeOf( $tok, '#^mw:Extension(/|$)#' ) &&
				$this->extName !== $tok->getAttributeV( 'name' )
			) {
				return true;
			}

			$lowerName = mb_strtolower( $tok->getName() );

			// Always escape isolated extension tags (T59469). Consider this:
			//    echo "<ref>foo<p></ref></p>" | node parse --html2wt
			// The <ref> and </ref> tag-like text is spread across the DOM,
			// and in the worst case can be anywhere. So, we conservatively
			// escape these elements always (which can lead to excessive
			// nowiki-escapes in some cases, but is always safe).
			if ( ( $type === 'TagTk' || $type === 'EndTagTk' ) &&
				$env->getSiteConfig()->isExtensionTag( $lowerName )
			) {
				return true;
			}

			// If the tag is one that's allowed in wikitext, it has to be
			// escaped inside <nowiki>s, because a text node's nodeValue
			// always returns non-escaped entities (that is, the text
			// "&lt;h2&gt;" comes back as "<h2>").
			// TODO: We should also do this for <a> tags because even if
			// they aren't allowed in wikitext and thus don't need to be
			// escaped, the result can be confusing for editors. However,
			// doing it here in a simple way interacts badly with normal
			// link escaping, so it's left for later.
			if ( isset( Consts::$Sanitizer['AllowedLiteralTags'][$lowerName] ) ) {
				return true;
			}
			continue;
		}

		switch ( $type ) {
			case 'SelfclosingTagTk':
				$name = $tok->getName();
				$inLinkContext = $state->inAttribute || $state->inLink;

				if ( $inLinkContext ) {
					// Ignore RFC/ISBN/PMID tokens when those are encountered
					// in the context of another link's content -- those are
					// not parsed to ext-links in that context. (T109371)
					if ( ( $name === 'extlink' || $name === 'wikilink' ) &&
						( $tok->dataParsoid->stx ?? null ) === 'magiclink'
					) {
						continue 2;
					}

					// Ignore url links in attributes (href, mostly)
					// since they are not in danger of being autolink-ified there.
					if ( $name === 'urllink' ) {
						continue 2;
					}
				}

				// A wikilink only matters when its target is valid.
				if ( $name === 'wikilink' &&
					!$env->isValidLinkTarget( $tok->getAttributeV( 'href' ) ?? '' )
				) {
					continue 2;
				}

				return true;

			case 'TagTk':
				$name = $tok->getName();

				// List item tokens are ignored while in a caption.
				if ( $state->inCaption && $name === 'listItem' ) {
					continue 2;
				}

				// Ignore mw:Entity tokens
				if ( $name === 'span' && TokenUtils::hasTypeOf( $tok, 'mw:Entity' ) ) {
					$openEntitySpans++;
					continue 2;
				}

				// Ignore table tokens outside of tables
				if ( in_array( $name, [ 'caption', 'td', 'tr', 'th' ], true ) &&
					!TokenUtils::isHTMLTag( $tok ) &&
					$state->wikiTableNesting === 0
				) {
					continue 2;
				}

				// Headings have both SOL and EOL requirements. This
				// tokenization here only verifies SOL requirements, not EOL
				// requirements. So, record this information so that we can
				// strip unnecessary nowikis after the fact.
				if ( preg_match( '/^h\d$/D', $name ) ) {
					$state->hasHeadingEscapes = true;
				}

				return true;

			case 'EndTagTk':
				$name = $tok->getName();

				// Ignore mw:Entity tokens
				if ( $openEntitySpans > 0 && $name === 'span' ) {
					$openEntitySpans--;
					continue 2;
				}

				// Ignore heading tokens
				if ( preg_match( '/^h\d$/D', $name ) ) {
					continue 2;
				}

				// Ignore table tokens outside of tables
				if ( in_array( $name, [ 'caption', 'table' ], true ) &&
					$state->wikiTableNesting === 0
				) {
					continue 2;
				}

				// </br>!
				if ( mb_strtolower( $name ) === 'br' ) {
					continue 2;
				}

				return true;
		}
	}

	return false;
}
621 | |
/**
 * Append $str to $buf inside a nowiki section, opening the section first
 * when none is currently open and closing it afterwards when asked to.
 *
 * @param string $str Text to append
 * @param bool $close Whether to emit </nowiki> after $str
 * @param bool &$inNowiki Tracks whether a <nowiki> is currently open
 * @param bool &$nowikisAdded Set to true once any <nowiki> has been opened
 * @param string &$buf Output buffer
 */
private static function nowikiWrap(
	string $str, bool $close, bool &$inNowiki, bool &$nowikisAdded, string &$buf
): void {
	$piece = $str;

	if ( !$inNowiki ) {
		$piece = '<nowiki>' . $piece;
		$nowikisAdded = true;
	}

	if ( $close ) {
		$piece .= '</nowiki>';
	}

	$buf .= $piece;
	// A section is open afterwards exactly when it was not closed here.
	$inNowiki = !$close;
}
636 | |
/**
 * This function attempts to wrap smallest escapable units into
 * nowikis (which can potentially add multiple nowiki pairs in a
 * single string). The idea here is that since this should all be
 * text, anything that tokenizes to another construct needs to be
 * wrapped.
 *
 * Full-wrapping is enabled if the string is being escaped within
 * context-specific handlers where the tokenization context might
 * be different from what we use in this code.
 *
 * Trailing newlines of $origText are never placed inside a nowiki; they
 * are split off first and re-appended to the result.
 *
 * @param SerializerState $state
 * @param bool $sol Whether tokenization starts in start-of-line context
 * @param string $origText
 * @param bool $fullWrap Wrap the whole text in one <nowiki> pair
 * @param bool $dontWrapIfUnnecessary When false, a text that produced no
 *   nowiki at all is still wrapped as a whole
 * @return string
 */
public function escapedText(
	SerializerState $state, bool $sol, string $origText,
	bool $fullWrap = false, bool $dontWrapIfUnnecessary = false
): string {
	// $match[1]: the text proper, $match[2]: its trailing newlines.
	Assert::invariant(
		preg_match( '/^(.*?)((?:\r?\n)*)$/sD', $origText, $match ),
		"Escaped text matching failed: {$origText}"
	);

	$text = $match[1];
	$nls = $match[2];

	if ( $fullWrap ) {
		return '<nowiki>' . $text . '</nowiki>' . $nls;
	}

	$buf = '';
	$inNowiki = false;
	$nowikisAdded = false;
	// These token types don't come with a closing tag
	$tokensWithoutClosingTag = PHPUtils::makeSet( [ 'listItem', 'td', 'tr' ] );

	// Reverse the entity-escaping of nowiki tags so that they tokenize as
	// nowikis instead of entity enclosed text. The pattern has to match
	// the *escaped* form (&lt;nowiki&gt;); matching literal angle brackets
	// would replace each tag with itself and achieve nothing.
	$text = preg_replace( '#&lt;(/?nowiki\s*/?\s*)&gt;#i', '<$1>', $text );

	$tokens = $this->tokenizeStr( $text, $sol );

	foreach ( $tokens as $t ) {
		if ( is_string( $t ) ) {
			if ( strlen( $t ) > 0 ) {
				$t = WTSUtils::escapeNowikiTags( $t );
				if ( !$inNowiki && ( ( $sol && $t[0] === ' ' ) || str_contains( $t, "\n " ) ) ) {
					// Split around each line-initial space; with the
					// delimiters captured, $x alternates between text
					// pieces (even indexes) and "" / "\n" (odd indexes).
					$x = preg_split( '/(^|\n) /', $t, -1, PREG_SPLIT_DELIM_CAPTURE );
					$buf .= $x[0];
					$lastIndexX = count( $x ) - 1;
					for ( $k = 1; $k < $lastIndexX; $k += 2 ) {
						$buf .= $x[$k];
						// A space at the very start of the string is left
						// bare when we are not at start of line.
						if ( $k !== 1 || $x[$k] === "\n" || $sol ) {
							self::nowikiWrap( ' ', true, $inNowiki, $nowikisAdded, $buf );
						} else {
							$buf .= ' ';
						}
						$buf .= $x[$k + 1];
					}
				} else {
					$buf .= $t;
				}
				$sol = false;
			}
			continue;
		}

		$tsr = $t->dataParsoid->tsr ?? null;
		if ( !( $tsr instanceof SourceRange ) ) {
			$env = $state->getEnv();
			$env->log(
				'error/html2wt/escapeNowiki',
				'Missing tsr for token ',
				PHPUtils::jsonEncode( $t ),
				'while processing text ',
				$text
			);

			// Bail and wrap the whole thing in a nowiki
			// if we have missing information.
			// Use match[1] since text has been clobbered above.
			return '<nowiki>' . $match[1] . '</nowiki>' . $nls;
		}

		// Now put back the escaping we removed above
		$tSrc = WTSUtils::escapeNowikiTags( $tsr->substr( $text ) );
		switch ( TokenUtils::getTokenType( $t ) ) {
			case 'NlTk':
				$buf .= $tSrc;
				$sol = true;
				break;
			case 'CommentTk':
				// Comments are sol-transparent
				$buf .= $tSrc;
				break;
			case 'TagTk':
				// Treat tokens with missing tags as self-closing tokens
				// for the purpose of minimal nowiki escaping
				self::nowikiWrap(
					$tSrc,
					isset( $tokensWithoutClosingTag[$t->getName()] ),
					$inNowiki,
					$nowikisAdded,
					$buf
				);
				$sol = false;
				break;
			case 'EndTagTk':
				self::nowikiWrap( $tSrc, true, $inNowiki, $nowikisAdded, $buf );
				$sol = false;
				break;
			case 'SelfclosingTagTk':
				if ( $t->getName() !== 'meta' ||
					!TokenUtils::hasTypeOf( $t, 'mw:EmptyLine' )
				) {
					// Don't bother with marker or empty-line metas
					self::nowikiWrap( $tSrc, true, $inNowiki, $nowikisAdded, $buf );
				}
				$sol = false;
				break;
		}
	}

	// close any unclosed nowikis
	if ( $inNowiki ) {
		$buf .= '</nowiki>';
	}

	// Make sure nowiki is always added
	// Ex: "foo]]" won't tokenize into tags at all
	if ( !$nowikisAdded && !$dontWrapIfUnnecessary ) {
		$buf = '';
		self::nowikiWrap( $text, true, $inNowiki, $nowikisAdded, $buf );
	}

	$buf .= $nls;
	return $buf;
}
780 | |
781 | /** |
782 | * @param SerializerState $state |
783 | * @param string $text |
784 | * @param array $opts [ 'node' => Node, 'inMultilineMode' => ?bool, 'isLastChild' => ?bool ] |
785 | * @return string |
786 | */ |
787 | public function escapeWikitext( SerializerState $state, string $text, array $opts ): string { |
788 | $env = $state->getEnv(); |
789 | $env->log( |
790 | 'trace/wt-escape', 'EWT:', |
791 | static function () use ( $text ) { |
792 | return PHPUtils::jsonEncode( $text ); |
793 | } |
794 | ); |
795 | |
796 | /* ----------------------------------------------------------------- |
797 | * General strategy: If a substring requires escaping, we can escape |
798 | * the entire string without further analysis of the rest of the string. |
799 | * ----------------------------------------------------------------- */ |
800 | |
801 | $hasMagicWord = preg_match( '/(^|\W)(RFC|ISBN|PMID)\s/', $text ); |
802 | $hasAutolink = $env->getSiteConfig()->findValidProtocol( $text ); |
803 | $fullCheckNeeded = !$state->inLink && ( $hasMagicWord || $hasAutolink ); |
804 | $hasQuoteChar = false; |
805 | $indentPreUnsafe = false; |
806 | $hasNonQuoteEscapableChars = false; |
807 | $indentPreSafeMode = $state->inIndentPre || $state->inPHPBlock; |
808 | $sol = $state->onSOL && !$indentPreSafeMode; |
809 | |
810 | // Fast path for special protected characters. |
811 | if ( $state->protect && preg_match( $state->protect, $text ) ) { |
812 | return $this->escapedText( $state, $sol, $text ); |
813 | } |
814 | |
815 | if ( !$fullCheckNeeded ) { |
816 | $hasQuoteChar = str_contains( $text, "'" ); |
817 | $indentPreUnsafe = !$indentPreSafeMode && ( |
818 | preg_match( '/\n +[^\r\n]*?\S+/', $text ) || |
819 | $sol && preg_match( '/^ +[^\r\n]*?\S+/', $text ) |
820 | ); |
821 | $hasNonQuoteEscapableChars = preg_match( '/[<>\[\]\-\+\|!=#\*:;~{}]|__[^_]*__/', $text ); |
822 | $hasLanguageConverter = preg_match( '/-\{|\}-/', $text ); |
823 | if ( $hasLanguageConverter ) { |
824 | $fullCheckNeeded = true; |
825 | } |
826 | } |
827 | |
828 | // Quick check for the common case (useful to kill a majority of requests) |
829 | // |
830 | // Pure white-space or text without wt-special chars need not be analyzed |
831 | if ( !$fullCheckNeeded && !$hasQuoteChar && !$indentPreUnsafe && !$hasNonQuoteEscapableChars ) { |
832 | $env->log( 'trace/wt-escape', '---No-checks needed---' ); |
833 | return $text; |
834 | } |
835 | |
836 | // Context-specific escape handler |
837 | $wteHandler = PHPUtils::lastItem( $state->wteHandlerStack ); |
838 | if ( $wteHandler && $wteHandler( $state, $text, $opts ) ) { |
839 | $env->log( 'trace/wt-escape', '---Context-specific escape handler---' ); |
840 | return $this->escapedText( $state, false, $text, true ); |
841 | } |
842 | |
843 | // Quote-escape test |
844 | if ( str_contains( $text, "''" ) || |
845 | self::hasLeadingEscapableQuoteChar( $text, $opts ) || |
846 | self::hasTrailingEscapableQuoteChar( $text, $opts ) |
847 | ) { |
848 | // Check if we need full-wrapping <nowiki>..</nowiki> |
849 | // or selective <nowiki/> escaping for quotes. |
850 | if ( $fullCheckNeeded || |
851 | $indentPreUnsafe || |
852 | ( $hasNonQuoteEscapableChars && |
853 | $this->hasWikitextTokens( $state, $sol, $text ) |
854 | ) |
855 | ) { |
856 | $env->log( 'trace/wt-escape', '---quotes: escaping text---' ); |
857 | // If the reason for full wrap is that the text contains non-quote |
858 | // escapable chars, it's still possible to minimize the contents |
859 | // of the <nowiki> (T71950). |
860 | return $this->escapedText( $state, $sol, $text ); |
861 | } else { |
862 | $quoteEscapedText = self::escapedIBSiblingNodeText( $state, $text, $opts ); |
863 | if ( $quoteEscapedText ) { |
864 | $env->log( 'trace/wt-escape', '---sibling of i/b tag---' ); |
865 | return $quoteEscapedText; |
866 | } |
867 | } |
868 | } |
869 | |
870 | // Template and template-arg markers are escaped unconditionally! |
871 | // Conditional escaping requires matching brace pairs and knowledge |
872 | // of whether we are in template arg context or not. |
873 | if ( preg_match( '/\{\{\{|\{\{|\}\}\}|\}\}/', $text ) ) { |
874 | $env->log( 'trace/wt-escape', '---Unconditional: transclusion chars---' ); |
875 | return $this->escapedText( $state, false, $text ); |
876 | } |
877 | |
878 | // Once we eliminate the possibility of multi-line tokens, split the text |
879 | // around newlines and escape each line separately. |
880 | if ( preg_match( '/\n./', $text ) ) { |
881 | $env->log( 'trace/wt-escape', '-- <multi-line-escaping-mode> --' ); |
882 | // We've already processed the full string in a context-specific handler. |
883 | // No more additional processing required. So, push/pop a null handler. |
884 | $state->wteHandlerStack[] = null; |
885 | |
886 | $tmp = []; |
887 | foreach ( explode( "\n", $text ) as $i => $line ) { |
888 | if ( $i > 0 ) { |
889 | // Update state |
890 | $state->onSOL = true; |
891 | $state->currLine->text = ''; |
892 | $opts['inMultilineMode'] = true; |
893 | } |
894 | $tmp[] = $this->escapeWikitext( $state, $line, $opts ); |
895 | } |
896 | $ret = implode( "\n", $tmp ); |
897 | |
898 | array_pop( $state->wteHandlerStack ); |
899 | |
900 | // If nothing changed, check if the original multiline string has |
901 | // any wikitext tokens (ex: multi-line html tags <div\n>foo</div\n>). |
902 | if ( $ret === $text && $this->hasWikitextTokens( $state, $sol, $text ) ) { |
903 | $env->log( 'trace/wt-escape', '---Found multi-line wt tokens---' ); |
904 | $ret = $this->escapedText( $state, $sol, $text ); |
905 | } |
906 | |
907 | $env->log( 'trace/wt-escape', '-- </multi-line-escaping-mode> --' ); |
908 | return $ret; |
909 | } |
910 | |
911 | $env->log( |
912 | 'trace/wt-escape', 'SOL:', $sol, |
913 | static function () use ( $text ) { |
914 | return PHPUtils::jsonEncode( $text ); |
915 | } |
916 | ); |
917 | |
918 | $hasTildes = preg_match( '/~{3,5}/', $text ); |
919 | if ( !$fullCheckNeeded && !$hasTildes ) { |
920 | // {{, {{{, }}}, }} are handled above. |
921 | // Test 1: '', [], <>, __FOO__ need escaping wherever they occur |
922 | // = needs escaping in end-of-line context |
923 | // Test 2: {|, |}, ||, |-, |+, , *#:;, ----, =*= need escaping only in SOL context. |
924 | if ( !$sol && !preg_match( "/''|[<>]|\\[.*\\]|\\]|(=[ ]*(\\n|$))|__[^_]*__/", $text ) ) { |
925 | // It is not necessary to test for an unmatched opening bracket ([) |
926 | // as long as we always escape an unmatched closing bracket (]). |
927 | $env->log( 'trace/wt-escape', '---Not-SOL and safe---' ); |
928 | return $text; |
929 | } |
930 | |
931 | // Quick checks when on a newline |
932 | // + can only occur as "|+" and - can only occur as "|-" or ---- |
933 | if ( $sol && !preg_match( '/(^|\n)[ #*:;=]|[<\[\]>\|\'!]|\-\-\-\-|__[^_]*__/', $text ) ) { |
934 | $env->log( 'trace/wt-escape', '---SOL and safe---' ); |
935 | return $text; |
936 | } |
937 | } |
938 | |
939 | // The front-end parser eliminated pre-tokens in the tokenizer |
940 | // and moved them to a stream handler. So, we always conservatively |
941 | // escape text with ' ' in sol posn with one caveat: |
942 | // * and when the current line has block tokens |
943 | if ( $indentPreUnsafe && |
944 | ( !self::hasBlocksOnLine( $state->currLine->firstNode, true ) || |
945 | !empty( $opts['inMultilineMode'] ) |
946 | ) |
947 | ) { |
948 | $env->log( 'trace/wt-escape', '---SOL and pre---' ); |
949 | $state->hasIndentPreNowikis = true; |
950 | return $this->escapedText( $state, $sol, $text ); |
951 | } |
952 | |
953 | // escape nowiki tags |
954 | $text = WTSUtils::escapeNowikiTags( $text ); |
955 | |
956 | // Use the tokenizer to see if we have any wikitext tokens |
957 | // |
958 | // Ignores entities |
959 | if ( $hasTildes ) { |
960 | $env->log( 'trace/wt-escape', '---Found tildes---' ); |
961 | return $this->escapedText( $state, $sol, $text ); |
962 | } elseif ( $this->hasWikitextTokens( $state, $sol, $text ) ) { |
963 | $env->log( 'trace/wt-escape', '---Found WT tokens---' ); |
964 | return $this->escapedText( $state, $sol, $text ); |
965 | } elseif ( preg_match( '/[^\[]*\]/', $text ) && |
966 | $this->textCanParseAsLink( $opts['node'], $state, $text ) |
967 | ) { |
968 | // we have an closing bracket, and |
969 | // - the text will get parsed as a link in |
970 | $env->log( 'trace/wt-escape', '---Links: complex single-line test---' ); |
971 | return $this->escapedText( $state, $sol, $text ); |
972 | } elseif ( !empty( $opts['isLastChild'] ) && substr( $text, -1 ) === '=' ) { |
973 | // 1. we have an open heading char, and |
974 | // - text ends in a '=' |
975 | // - text comes from the last child |
976 | preg_match( '/^h(\d)/', DOMCompat::nodeName( $state->currLine->firstNode ), $headingMatch ); |
977 | if ( $headingMatch ) { |
978 | $n = intval( $headingMatch[1] ); |
979 | if ( ( $state->currLine->text . $text )[$n] === '=' ) { |
980 | // The first character after the heading wikitext is/will be a '='. |
981 | // So, the trailing '=' can change semantics if it is not nowikied. |
982 | $env->log( 'trace/wt-escape', '---Heading: complex single-line test---' ); |
983 | return $this->escapedText( $state, $sol, $text ); |
984 | } else { |
985 | return $text; |
986 | } |
987 | } elseif ( strlen( $state->currLine->text ) > 0 && $state->currLine->text[0] === '=' ) { |
988 | $env->log( 'trace/wt-escape', '---Text-as-heading: complex single-line test---' ); |
989 | return $this->escapedText( $state, $sol, $text ); |
990 | } else { |
991 | return $text; |
992 | } |
993 | } else { |
994 | $env->log( 'trace/wt-escape', '---All good!---' ); |
995 | return $text; |
996 | } |
997 | } |
998 | |
/**
 * Append one fragment of a template / template-arg argument to $buf,
 * protecting characters that could change how the enclosing transclusion
 * is parsed.
 *
 * With $checkNowiki false, $str is appended verbatim after closing any
 * <nowiki> left open by an earlier call. Otherwise $str is checked for:
 *  - `{{`, `}}`, `[[`, `]]` or `-{` remaining after simple matched
 *    `[[..]]`, `{{..}}` and `-{..}-` pairs have been stripped,
 *  - `=` when the argument is not serialized as a named param
 *    (skipped for type 'templatearg'),
 *  - a trailing `}` on the last fragment of the last argument,
 *  - `|`.
 * If exactly one reason applies, it has a targeted substitution (only the
 * trailing `}` and `|` do) and no <nowiki> is currently open, just that
 * substitution is applied. Otherwise the fragment goes inside a <nowiki>
 * that is left open so following fragments can share it.
 *
 * @param string $str Fragment to append
 * @param bool $isLast Whether this is the last fragment of the argument
 * @param bool $checkNowiki Whether $str must be inspected for unsafe chars
 * @param string &$buf Output buffer; appended to in place
 * @param bool &$openNowiki Whether $buf currently ends inside an unclosed
 *  <nowiki>; updated in place
 * @param bool $isTemplate Whether the argument belongs to a template
 * @param bool &$serializeAsNamed Whether the argument is emitted as a named
 *  param; may be switched to true here
 * @param array $opts [ 'numPositionalArgs' => int, 'argPositionalIndex' => int, 'type' => string,
 *  'numArgs' => int, 'argIndex' => int ]
 */
private static function appendStr(
	string $str, bool $isLast, bool $checkNowiki, string &$buf, bool &$openNowiki,
	bool $isTemplate, bool &$serializeAsNamed, array $opts
): void {
	if ( !$checkNowiki ) {
		// Nothing to inspect: close any pending <nowiki> and emit as-is.
		if ( $openNowiki ) {
			$buf .= '</nowiki>';
			$openNowiki = false;
		}
		$buf .= $str;
		return;
	}

	// '=' is not allowed in positional parameters. We can either
	// nowiki escape it or convert the positional parameter into a
	// named param to avoid the escaping.
	if ( $isTemplate && !$serializeAsNamed && str_contains( $str, '=' ) ) {
		// In certain situations, it is better to add a nowiki escape
		// rather than convert this to a named param.
		//
		// Ex: Consider: {{funky-tpl|a|b|c|d|e|f|g|h}}
		//
		// If an editor changes 'a' to 'f=oo' and we convert it to
		// a named param 1=f=oo, we are effectively converting all
		// the later params into named params as well and we get
		// {{funky-tpl|1=f=oo|2=b|3=c|...|8=h}} instead of
		// {{funky-tpl|<nowiki>f=oo</nowiki>|b|c|...|h}}
		//
		// The latter is better in this case. This is a real problem
		// in production.
		//
		// For now, we use a simple heuristic below and can be
		// refined later, if necessary
		//
		// 1. Either there were no original positional args
		// 2. Or, only the last positional arg uses '='
		if ( $opts['numPositionalArgs'] === 0 ||
			$opts['numPositionalArgs'] === $opts['argPositionalIndex']
		) {
			$serializeAsNamed = true;
		}
	}

	// Count how many reasons for nowiki
	$needNowikiCount = 0;
	// [ pattern, replacement ] for a cheaper, targeted escape; stays null
	// when the only available protection is a <nowiki> wrapper.
	$neededSubstitution = null;
	// Protect against unmatched pairs of braces and brackets, as they
	// should never appear in template arguments.
	$bracketPairStrippedStr = preg_replace(
		'/\[\[([^\[\]]*)\]\]|\{\{([^\{\}]*)\}\}|-\{([^\{\}]*)\}-/',
		'_$1_',
		$str
	);
	if ( preg_match( '/\{\{|\}\}|\[\[|\]\]|-\{/', $bracketPairStrippedStr ) ) {
		$needNowikiCount++;
	}
	// This runs after the named-param decision above, so a value that was
	// just switched to a named param no longer counts '=' as a reason.
	if ( $opts['type'] !== 'templatearg' && !$serializeAsNamed && str_contains( $str, '=' ) ) {
		$needNowikiCount++;
	}
	if ( $opts['argIndex'] === $opts['numArgs'] && $isLast && str_ends_with( $str, '}' ) ) {
		// If this is the last part of the last argument, we need to protect
		// against an ending }, as it would get confused with the template ending }}.
		$needNowikiCount++;
		$neededSubstitution = [ '/(\})$/D', '<nowiki>}</nowiki>' ];
	}
	if ( str_contains( $str, '|' ) ) {
		// If there's an unprotected |, guard it so it doesn't get confused
		// with the beginning of a different parameter.
		$needNowikiCount++;
		$neededSubstitution = [ '/\|/', '{{!}}' ];
	}

	// Now, if we aren't already in a <nowiki> and there's only one reason to
	// protect, avoid guarding too much text by just substituting.
	if ( !$openNowiki && $needNowikiCount === 1 && $neededSubstitution ) {
		$str = preg_replace( $neededSubstitution[0], $neededSubstitution[1], $str );
		// The single reason has been dealt with; keep the counter an int.
		$needNowikiCount = 0;
	}
	if ( !$openNowiki && $needNowikiCount ) {
		$buf .= '<nowiki>';
		$openNowiki = true;
	}
	if ( !$needNowikiCount && $openNowiki ) {
		$buf .= '</nowiki>';
		$openNowiki = false;
	}
	$buf .= $str;
}
1098 | |
/**
 * Escape the wikitext of a single template / template-arg argument.
 *
 * General strategy:
 *
 * Tokenize the arg wikitext. Anything that parses as tags
 * are good and we need not bother with those. Check for harmful
 * characters `[[]]{{}}` or additionally `=` in positional parameters and escape
 * those fragments since these characters could change semantics of the entire
 * template transclusion.
 *
 * This function makes a couple of assumptions:
 *
 * 1. The tokenizer sets tsr on all non-string tokens.
 * 2. The tsr on TagTk and EndTagTk corresponds to the
 *    width of the opening and closing wikitext tags and not
 *    the entire DOM range they span in the end.
 *
 * @param string $arg Argument wikitext
 * @param array $opts [ 'serializeAsNamed' => bool, 'numPositionalArgs' => int,
 *  'argPositionalIndex' => int, 'type' => string, 'numArgs' => int, 'argIndex' => int ]
 * @return array [ 'serializeAsNamed' => bool, 'v' => string ] where 'v' is
 *  the escaped wikitext and 'serializeAsNamed' is the (possibly updated)
 *  decision on emitting the argument as a named param
 */
public function escapeTplArgWT( string $arg, array $opts ): array {
	$env = $this->env;
	$serializeAsNamed = $opts['serializeAsNamed'];
	$buf = '';
	$openNowiki = false;
	$isTemplate = $opts['type'] === 'template';

	$tokens = $this->tokenizeStr( $arg, false );

	for ( $i = 0, $n = count( $tokens ); $i < $n; $i++ ) {
		$t = $tokens[$i];
		// Computed before the look-ahead below advances $i, so it refers
		// to the position of the token that started this iteration.
		$last = $i === $n - 1;

		// For mw:Entity spans, the opening and closing tags have 0 width
		// and the enclosed content is the decoded entity. Hence the
		// special case to serialize back the entity's source.
		if ( $t instanceof TagTk ) {
			$da = $t->dataParsoid;
			if ( TokenUtils::matchTypeOf( $t, '#^mw:(Placeholder|Entity)(/|$)#' ) ) {
				// Jump over the content to what is presumably the matching
				// end tag and emit the original source of the whole span.
				// NOTE(review): assumes exactly one token sits between the
				// open and close tags; there is no bounds check here.
				$i += 2;
				$width = $tokens[$i]->dataParsoid->tsr->end - $da->tsr->start;
				self::appendStr(
					substr( $arg, $da->tsr->start, $width ),
					$last,
					false,
					$buf,
					$openNowiki,
					$isTemplate,
					$serializeAsNamed,
					$opts
				);
				continue;
			} elseif ( TokenUtils::hasTypeOf( $t, 'mw:Nowiki' ) ) {
				// Scan forward to the closing mw:Nowiki end tag.
				$i++;
				while ( $i < $n &&
					( !$tokens[$i] instanceof EndTagTk ||
						!TokenUtils::hasTypeOf( $tokens[$i], 'mw:Nowiki' )
					)
				) {
					$i++;
				}
				if ( $i < $n ) {
					// After tokenization, we can get here:
					// * Text explicitly protected by <nowiki> in the parameter.
					// * Other things that should be protected but weren't
					//   according to the tokenizer.
					// In template argument, we only need to check for unmatched
					// braces and brackets pairs (which is done in appendStr),
					// but only if they weren't explicitly protected in the
					// passed wikitext.
					$width = $tokens[$i]->dataParsoid->tsr->end - $da->tsr->start;
					$substr = substr( $arg, $da->tsr->start, $width );
					self::appendStr(
						$substr,
						$last,
						!preg_match( '#<nowiki>[^<]*</nowiki>#', $substr ),
						$buf,
						$openNowiki,
						$isTemplate,
						$serializeAsNamed,
						$opts
					);
				}
				// If no closing tag was found, the remaining tokens have
				// been consumed and nothing is emitted for them.
				continue;
			}
		}

		switch ( TokenUtils::getTokenType( $t ) ) {
			case 'TagTk':
			case 'EndTagTk':
			case 'NlTk':
			case 'CommentTk':
				// Emitted verbatim from the source range of the token.
				$da = $t->dataParsoid;
				if ( empty( $da->tsr ) ) {
					$errors = [ 'Missing tsr for: ' . PHPUtils::jsonEncode( $t ) ];
					$errors[] = 'Arg : ' . PHPUtils::jsonEncode( $arg );
					$errors[] = 'Toks: ' . PHPUtils::jsonEncode( $tokens );
					$env->log( 'error/html2wt/wtescape', implode( "\n", $errors ) );
					// FIXME $da->tsr will be undefined below.
					// Should we throw an explicit exception here?
				}
				self::appendStr(
					$da->tsr->substr( $arg ),
					$last,
					false,
					$buf,
					$openNowiki,
					$isTemplate,
					$serializeAsNamed,
					$opts
				);
				break;
			case 'SelfclosingTagTk':
				$da = $t->dataParsoid;
				if ( empty( $da->tsr ) ) {
					$errors = [ 'Missing tsr for: ' . PHPUtils::jsonEncode( $t ) ];
					$errors[] = 'Arg : ' . PHPUtils::jsonEncode( $arg );
					$errors[] = 'Toks: ' . PHPUtils::jsonEncode( $tokens );
					$env->log( 'error/html2wt/wtescape', implode( "\n", $errors ) );
					// FIXME $da->tsr will be undefined below.
					// Should we throw an explicit exception here?
				}
				$tkSrc = $da->tsr->substr( $arg );
				// Replace pipe by an entity. This is not completely safe.
				if ( $t->getName() === 'extlink' || $t->getName() === 'urllink' ) {
					// Re-tokenize the link source so that embedded tokens are
					// emitted from their own source and only the plain-string
					// pieces get entity-escaped.
					$tkBits = $this->tokenizer->tokenizeSync( $tkSrc, [
							'startRule' => 'tplarg_or_template_or_bust'
						]
					);
					foreach ( $tkBits as $bit ) {
						if ( $bit instanceof Token ) {
							self::appendStr(
								$bit->dataParsoid->src,
								$last,
								false,
								$buf,
								$openNowiki,
								$isTemplate,
								$serializeAsNamed,
								$opts
							);
						} else {
							// Convert to a named param w/ the same reasoning
							// as in appendStr, however, here we replace
							// with an entity to avoid breaking up querystrings
							// with nowikis.
							// NOTE(review): this compares against 'argIndex'
							// whereas appendStr uses 'argPositionalIndex' for
							// the same heuristic -- confirm which is intended.
							if ( $isTemplate && !$serializeAsNamed && str_contains( $bit, '=' ) ) {
								if ( $opts['numPositionalArgs'] === 0
									|| $opts['numPositionalArgs'] === $opts['argIndex']
								) {
									$serializeAsNamed = true;
								} else {
									// Numeric character reference for '='
									$bit = str_replace( '=', '&#61;', $bit );
								}
							}
							// Numeric character reference for '|'
							$buf .= str_replace( '|', '&#124;', $bit );
						}
					}
				} else {
					self::appendStr(
						$tkSrc,
						$last,
						false,
						$buf,
						$openNowiki,
						$isTemplate,
						$serializeAsNamed,
						$opts
					);
				}
				break;
			case 'string':
				// Plain text is the only token kind that gets checked for
				// characters needing protection.
				self::appendStr(
					$t,
					$last,
					true,
					$buf,
					$openNowiki,
					$isTemplate,
					$serializeAsNamed,
					$opts
				);
				break;
			case 'EOFTk':
				break;
		}
	}

	// If nowiki still open, close it now.
	if ( $openNowiki ) {
		$buf .= '</nowiki>';
	}

	return [ 'serializeAsNamed' => $serializeAsNamed, 'v' => $buf ];
}
1295 | |
/**
 * Entity-escape and then wikitext-escape text that is serialized as the
 * content of a link, using the media-option handler for media and the
 * wikilink handler otherwise.
 *
 * See also `escapeLinkTarget` in LinkHandler.php
 *
 * Side effects on $state: `onSOL` is overwritten with $solState and is not
 * restored; `inLink` is true for the duration of the escaping and false
 * afterwards; the escape handler pushed on `wteHandlerStack` is popped
 * again before returning, also when escaping throws.
 *
 * @param SerializerState $state
 * @param string $str Link content to escape
 * @param bool $solState Value assigned to $state->onSOL before escaping
 * @param Node $node Passed to escapeWikitext() as the 'node' option
 * @param bool $isMedia Whether to escape with the media-option handler
 * @return string
 */
public function escapeLinkContent(
	SerializerState $state, string $str, bool $solState, Node $node, bool $isMedia
): string {
	// Entity-escape the content.
	$str = Utils::escapeWtEntities( $str );

	// Wikitext-escape content.
	$state->onSOL = $solState;
	$state->wteHandlerStack[] = $isMedia
		? [ $this, 'mediaOptionHandler' ]
		: [ $this, 'wikilinkHandler' ];
	$state->inLink = true;
	try {
		return $this->escapeWikitext( $state, $str, [ 'node' => $node ] );
	} finally {
		// Always undo the temporary link context so that a failure while
		// escaping cannot leave a stale handler on the stack or a stuck
		// inLink flag behind.
		$state->inLink = false;
		array_pop( $state->wteHandlerStack );
	}
}
1324 | } |