Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 580 |
|
0.00% |
0 / 22 |
CRAP | |
0.00% |
0 / 1 |
WikitextEscapeHandlers | |
0.00% |
0 / 580 |
|
0.00% |
0 / 22 |
61256 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
startsOnANewLine | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
hasBlocksOnLine | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
110 | |||
hasLeadingEscapableQuoteChar | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
20 | |||
hasTrailingEscapableQuoteChar | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
20 | |||
escapedIBSiblingNodeText | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
30 | |||
isFirstContentNode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
liHandler | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
42 | |||
thHandler | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
mediaOptionHandler | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
wikilinkHandler | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
aHandler | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
tdHandler | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
182 | |||
tokenizeStr | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
textCanParseAsLink | |
0.00% |
0 / 52 |
|
0.00% |
0 / 1 |
552 | |||
hasWikitextTokens | |
0.00% |
0 / 67 |
|
0.00% |
0 / 1 |
1640 | |||
nowikiWrap | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 | |||
escapedText | |
0.00% |
0 / 78 |
|
0.00% |
0 / 1 |
600 | |||
escapeWikitext | |
0.00% |
0 / 111 |
|
0.00% |
0 / 1 |
2652 | |||
appendStr | |
0.00% |
0 / 37 |
|
0.00% |
0 / 1 |
552 | |||
escapeTplArgWT | |
0.00% |
0 / 127 |
|
0.00% |
0 / 1 |
812 | |||
escapeLinkContent | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Html2Wt; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Parsoid\Config\Env; |
8 | use Wikimedia\Parsoid\DOM\Element; |
9 | use Wikimedia\Parsoid\DOM\Node; |
10 | use Wikimedia\Parsoid\Tokens\EndTagTk; |
11 | use Wikimedia\Parsoid\Tokens\EOFTk; |
12 | use Wikimedia\Parsoid\Tokens\SelfClosingTagTk; |
13 | use Wikimedia\Parsoid\Tokens\SourceRange; |
14 | use Wikimedia\Parsoid\Tokens\TagTk; |
15 | use Wikimedia\Parsoid\Tokens\Token; |
16 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
17 | use Wikimedia\Parsoid\Utils\DOMCompat; |
18 | use Wikimedia\Parsoid\Utils\DOMUtils; |
19 | use Wikimedia\Parsoid\Utils\PHPUtils; |
20 | use Wikimedia\Parsoid\Utils\TokenUtils; |
21 | use Wikimedia\Parsoid\Utils\Utils; |
22 | use Wikimedia\Parsoid\Utils\WTUtils; |
23 | use Wikimedia\Parsoid\Wikitext\Consts; |
24 | use Wikimedia\Parsoid\Wt2Html\PegTokenizer; |
25 | |
26 | class WikitextEscapeHandlers { |
27 | |
28 | private const LINKS_ESCAPE_RE = '/(\[\[)|(\]\])|(-\{)|(^[^\[]*\]$)/D'; |
29 | |
30 | /** @var Env */ |
31 | private $env; |
32 | |
33 | /** @var ?string */ |
34 | private $extName; |
35 | |
36 | /** |
37 | * @var PegTokenizer |
38 | */ |
39 | private $tokenizer; |
40 | |
/**
 * @param Env $env Environment; also handed to the tokenizer created here.
 * @param ?string $extName Extension name kept for later comparison against
 *   the 'name' attribute of extension tokens.
 */
public function __construct( Env $env, ?string $extName ) {
	$this->tokenizer = new PegTokenizer( $env );
	$this->extName = $extName;
	$this->env = $env;
}
46 | |
/**
 * Check whether the node's tag opens a block scope without the node being a
 * literal HTML node.
 *
 * This is used to ignore the cases where the serializer adds newlines that
 * are not present in the DOM.
 *
 * @param Node $node
 * @return bool
 */
private static function startsOnANewLine( Node $node ): bool {
	$tagName = DOMCompat::nodeName( $node );
	if ( !TokenUtils::tagOpensBlockScope( $tagName ) ) {
		return false;
	}
	return !WTUtils::isLiteralHTMLNode( $node );
}
57 | |
/**
 * Scan forward from $node across its siblings (descending into element
 * children) looking for block content on the current line.
 *
 * The scan gives up with false as soon as a non-element node containing
 * a newline is met at the current sibling level.
 *
 * @param Node $node Node to start scanning at
 * @param bool $first Whether $node is the first node of the line
 * @return bool
 */
private static function hasBlocksOnLine( Node $node, bool $first ): bool {
	$cur = $node;

	if ( $first ) {
		// Special case for the first node: we are at start-of-line, so a
		// newline sitting in the very first character is ignored.
		$content = $cur->textContent;
		$searchFrom = strlen( $content ) > 0 ? 1 : 0;
		if ( strpos( $content, "\n", $searchFrom ) !== false ) {
			return false;
		}
		$cur = $cur->nextSibling;
	}

	for ( ; $cur; $cur = $cur->nextSibling ) {
		if ( !( $cur instanceof Element ) ) {
			// A newline in non-element content ends the line.
			if ( str_contains( $cur->textContent, "\n" ) ) {
				return false;
			}
			continue;
		}

		if ( DOMUtils::isWikitextBlockNode( $cur ) ) {
			// A block only counts if it stays on this line.
			return !self::startsOnANewLine( $cur );
		}

		if ( $cur->hasChildNodes() && self::hasBlocksOnLine( $cur->firstChild, false ) ) {
			return true;
		}
	}

	return false;
}
96 | |
/**
 * Check whether the text node starts with a single quote that sits right
 * after (or, lacking a previous sibling, directly inside) a quote element.
 *
 * @param string $text
 * @param array $opts [ 'node' => Node ]
 * @return bool
 */
private static function hasLeadingEscapableQuoteChar( string $text, array $opts ): bool {
	/** @var Node $node */
	$node = $opts['node'];

	// The test runs on the node's own textContent rather than $text, since
	// textContent has not had newlines stripped out of it.
	// Ex: For this DOM: <i>x</i>\n'\n<i>y</i>
	// node.textContent = \n'\n and text = '
	// Newlines ahead of a leading quote separate it from the neighbouring
	// element and so avoid unnecessary <nowiki/> protection.
	if ( !str_starts_with( $node->textContent, "'" ) ) {
		return false;
	}

	// Fall back to the parent when there is no previous non-deleted sibling.
	$neighbour = DiffDOMUtils::previousNonDeletedSibling( $node ) ?: $node->parentNode;

	return (bool)DOMUtils::isQuoteElt( $neighbour );
}
124 | |
/**
 * Check whether the text node ends with a single quote that sits right
 * before (or, lacking a next sibling, directly inside) a quote element.
 *
 * @param string $text
 * @param array $opts [ 'node' => Node ]
 * @return bool
 */
private static function hasTrailingEscapableQuoteChar( string $text, array $opts ): bool {
	$node = $opts['node'];

	// The test runs on the node's own textContent rather than $text, since
	// textContent has not had newlines stripped out of it.
	// Ex: For this DOM: <i>x</i>\n'\n<i>y</i>
	// node.textContent = \n'\n and text = '
	// Newlines after a trailing quote separate it from the neighbouring
	// element and so avoid unnecessary <nowiki/> protection.
	if ( !str_ends_with( $node->textContent, "'" ) ) {
		return false;
	}

	// Fall back to the parent when there is no next non-deleted sibling.
	$neighbour = DiffDOMUtils::nextNonDeletedSibling( $node ) ?: $node->parentNode;

	return (bool)DOMUtils::isQuoteElt( $neighbour );
}
151 | |
/**
 * Quote-escape the text of a text node that is a sibling of i/b tags.
 *
 * Returns the escaped text, or the empty string when no quote escaping
 * was applied.
 *
 * SSS FIXME: By doing a DOM walkahead to identify what else is on the current line,
 * these heuristics can be improved. Ex: '<i>foo</i> blah blah does not require a
 * <nowiki/> after the single quote since we know that there are no other quotes on
 * the rest of the line that will be emitted. Similarly, '' does not need a <nowiki>
 * wrapper since there are no other quote chars on the line.
 *
 * @param SerializerState $state
 * @param string $text
 * @param array $opts [ 'node' => Node ]
 * @return string
 */
private static function escapedIBSiblingNodeText(
	SerializerState $state, string $text, array $opts
): string {
	// A run of 2+ quote chars has to be fully wrapped in <nowiki>...</nowiki>;
	// a <nowiki/> at the start and end doesn't work.
	//
	// Ex: ''<i>foo</i> should serialize to <nowiki>''</nowiki>''foo''.
	//
	// Serializing it to ''<nowiki/>''foo'' breaks html2html semantics
	// since it will parse back to <i><meta../></i>foo<i></i>
	if ( str_contains( $text, "''" ) ) {
		// Keep the wrapped span minimal: everything from the first quote
		// char through the last quote char, and nothing outside of it.
		$parts = explode( "'", $text );
		$head = array_shift( $parts );
		$tail = array_pop( $parts );
		return "{$head}<nowiki>'" . implode( "'", $parts ) . "'</nowiki>{$tail}";
	}

	// Otherwise only the head and/or tail of the text may need <nowiki/>.
	$protectTail = self::hasTrailingEscapableQuoteChar( $text, $opts );
	$protectHead = self::hasLeadingEscapableQuoteChar( $text, $opts );
	if ( !$protectTail && !$protectHead ) {
		return '';
	}

	$state->hasQuoteNowikis = true;
	return ( $protectHead ? '<nowiki/>' : '' ) . $text . ( $protectTail ? '<nowiki/>' : '' );
}
199 | |
/**
 * Check that no non-deleted sibling precedes the node.
 *
 * @param Node $node
 * @return bool
 */
public function isFirstContentNode( Node $node ): bool {
	// Deleted-node markers are skipped by the helper.
	$precedingSibling = DiffDOMUtils::previousNonDeletedSibling( $node );
	return $precedingSibling === null;
}
204 | |
/**
 * Escape test for text that is a direct child of a list item node.
 *
 * @param Node $liNode
 * @param SerializerState $state
 * @param string $text
 * @param array $opts [ 'node' => Node ]
 * @return bool Whether $text needs escaping
 */
public function liHandler(
	Node $liNode, SerializerState $state, string $text, array $opts
): bool {
	/** @var Node $node */
	$node = $opts['node'];

	// Only direct children of the list item are of interest.
	if ( $node->parentNode !== $liNode ) {
		return false;
	}

	// For <dt> nodes, ":" triggers nowiki outside of elements.
	if ( DOMCompat::nodeName( $liNode ) === 'dt' && str_contains( $text, ':' ) ) {
		return true;
	}

	// Wikitext styling might require whitespace insertion after list bullets.
	// In those scenarios, presence of bullet-wikitext in the text node is okay,
	// hence the requirement that the line so far holds nothing but bullets.
	$lineIsOnlyBullets = preg_match( '/^[#*:;]*$/D', $state->currLine->text );
	if ( !$lineIsOnlyBullets || !$this->isFirstContentNode( $node ) ) {
		return false;
	}

	// For first nodes of <li>'s, a bullet char in sol position triggers escaping.
	return strspn( $text, '#*:;', 0, 1 ) > 0;
}
236 | |
/**
 * Escape test for text inside the subtree of a <th> node.
 *
 * @param Node $thNode
 * @param SerializerState $state
 * @param string $text
 * @param array $opts [ 'node' => Node ]
 * @return bool Whether $text needs escaping
 */
public function thHandler(
	Node $thNode, SerializerState $state, string $text, array $opts
): bool {
	// {|
	// !a<div>!!b</div>
	// !c<div>||d</div>
	// |}
	//
	// The <div> will get split across two <th> tags because
	// the !! and | have higher precedence in the tokenizer.
	//
	// So, wherever in the DOM subtree of the <th> node the text shows up,
	// the !! and | characters have to be escaped unconditionally, as long
	// as the text serializes to the same line the heading was started on.
	if ( !preg_match( '/^\s*!/', $state->currLine->text ) ) {
		return false;
	}

	// The alternation matches either "!!" on the first line of $text,
	// or a "|" anywhere in $text.
	return (bool)preg_match( '/^[^\n]*!!|\|/', $text );
}
264 | |
/**
 * Escape test for media option text: true for a "|" or for anything
 * matched by LINKS_ESCAPE_RE.
 *
 * @param SerializerState $state
 * @param string $text
 * @return bool
 */
public function mediaOptionHandler( SerializerState $state, string $text ): bool {
	if ( str_contains( $text, '|' ) ) {
		return true;
	}
	return (bool)preg_match( self::LINKS_ESCAPE_RE, $text );
}
268 | |
/**
 * Escape test for wikilink text: true for anything matched by LINKS_ESCAPE_RE.
 *
 * @param SerializerState $state
 * @param string $text
 * @return bool
 */
public function wikilinkHandler( SerializerState $state, string $text ): bool {
	$matched = preg_match( self::LINKS_ESCAPE_RE, $text );
	return (bool)$matched;
}
272 | |
/**
 * Escape test for <a> content: true when the text holds a "]".
 *
 * @param SerializerState $state
 * @param string $text
 * @return bool
 */
public function aHandler( SerializerState $state, string $text ): bool {
	$hasClosingBracket = str_contains( $text, ']' );
	return $hasClosingBracket;
}
276 | |
/**
 * Escape test for text inside the subtree of a <td> node.
 *
 * @param Node $tdNode
 * @param bool $inWideTD
 * @param SerializerState $state
 * @param string $text
 * @param array $opts [ 'node' => ?Node ]
 * @return bool Whether $text needs escaping
 */
public function tdHandler(
	Node $tdNode, bool $inWideTD, SerializerState $state, string $text, array $opts
): bool {
	$node = $opts['node'] ?? null;

	/*
	 * "|" anywhere in a text node of the <td> subtree can be trouble!
	 * Looking at the immediate children of the <td> is not sufficient.
	 * Try parsing the following table:
	 *
	 * {|
	 * |a''b|c''
	 * |}
	 *
	 * Similarly, "-" or "+" emitted after a "|" in sol position is
	 * trouble. Besides showing up as the immediate first child of tdNode,
	 * they can appear on the leftmost path from tdNode as long as that
	 * path only has nodes that don't emit any wikitext.
	 * Ex: <td><p>-</p></td>, but not: <td><small>-</small></td>
	 */

	// Everything below only applies if 'text' is on the same wikitext
	// line as the "|" corresponding to the <td>.
	if ( $node && $state->currLine->firstNode !== $tdNode ) {
		return false;
	}

	// * | in a td should be escaped
	if ( str_contains( $text, '|' ) ) {
		return true;
	}

	// * +-} in SOL position (if they show up on the leftmost path with
	//   only zero-wt-emitting nodes on that path)
	$solHazard = !$inWideTD &&
		$state->currLine->text === '|' &&
		strspn( $text, '-+}', 0, 1 ) &&
		$node;
	if ( !$solHazard ) {
		return false;
	}

	// Walk up from the text node towards the <td>.
	for ( $n = $node; $n && $n !== $tdNode; $n = $n->parentNode ) {
		if ( !$this->isFirstContentNode( $n ) ) {
			return false;
		}
		if ( $n !== $node && !WTUtils::isZeroWidthWikitextElt( $n ) ) {
			return false;
		}
	}

	return true;
}
332 | |
/**
 * Run the tokenizer over a string and return its tokens with the
 * trailing EOFTk removed.
 *
 * @param string $str
 * @param bool $sol Passed to the tokenizer as its 'sol' option
 * @return array
 */
public function tokenizeStr( string $str, bool $sol ): array {
	$tokens = $this->tokenizer->tokenizeSync( $str, [ 'sol' => $sol ] );

	// The last token is popped off unconditionally and must be the EOF marker.
	$trailer = array_pop( $tokens );
	Assert::invariant( $trailer instanceof EOFTk, 'Expected EOF token!' );

	return $tokens;
}
348 | |
/**
 * Decide whether $text, once appended to the text of the current line,
 * could tokenize as (part of) a link and therefore needs nowiki-ing.
 *
 * The current line plus $text is tokenized and the tokens are walked from
 * the end backwards; the original source is rebuilt in $buf for tokens
 * that turn out not to be real links. As soon as $text is found intact at
 * the end of $buf, it is known to be safe.
 *
 * @param Node $node Only consulted when an extlink href is a template token;
 *   its previous siblings are then walked (the parameter is reassigned).
 * @param SerializerState $state
 * @param string $text
 * @return bool True when nowiki-ing is needed or safety could not be proven
 */
public function textCanParseAsLink( Node $node, SerializerState $state, string $text ): bool {
	$env = $state->getEnv();
	$env->log(
		'trace/wt-escape', 'link-test-text=',
		static function () use ( $text ) {
			return PHPUtils::jsonEncode( $text );
		}
	);

	// Strip away extraneous characters after a ]] or a ]
	// They are inessential to the test of whether the ]]/]
	// will get parsed into a wikilink and only complicate
	// the logic (needing to ignore entities, etc.).
	$text = preg_replace( '/\][^\]]*$/D', ']', $text, 1 );

	// text only contains ']' chars.
	// Since we stripped everything after ']' above, if a newline is
	// present, a link would have to straddle newlines which is not valid.
	if ( str_contains( $text, "\n" ) ) {
		return false;
	}

	$str = $state->currLine->text . $text;
	$tokens = $this->tokenizeStr( $str, false ); // sol state is irrelevant here
	$n = count( $tokens );
	// NOTE(review): assumes at least one token; an empty $tokens array would
	// read index -1 here -- confirm callers never pass an empty $str.
	$lastToken = $tokens[$n - 1];

	$env->log( 'trace/wt-escape', 'str=', $str, ';tokens=', $tokens );

	// If 'text' remained outside of any non-string tokens,
	// it does not need nowiking.
	if ( $lastToken === $text ||
		( is_string( $lastToken ) &&
			$text === substr( $lastToken, -strlen( $text ) )
		)
	) {
		return false;
	}

	// Verify that the tokenized links are valid links
	$buf = '';
	for ( $i = $n - 1; $i >= 0; $i-- ) {
		$t = $tokens[$i];
		if ( is_string( $t ) ) {
			$buf = $t . $buf;
		} elseif ( $t->getName() === 'wikilink' ) {
			$target = $t->getAttributeV( 'href' );
			if ( is_array( $target ) ) {
				// FIXME: in theory template expansion *could* make this a link.
				return false;
			}
			// A valid link target without a valid protocol => needs nowiking
			if ( $env->isValidLinkTarget( $target ) &&
				!$env->getSiteConfig()->hasValidProtocol( $target )
			) {
				return true;
			}

			// Assumes 'src' will always be present which it seems to be.
			// Tests will fail if anything changes in the tokenizer.
			$buf = $t->dataParsoid->src . $buf;
		} elseif ( $t->getName() === 'extlink' ) {
			// Check if the extlink came from a template which in the end
			// would not really parse as an extlink.

			$href = $t->getAttributeV( 'href' );
			if ( is_array( $href ) ) {
				$href = $href[0];
			}

			if ( !TokenUtils::isTemplateToken( $href ) ) {
				// Not a template and a real href => needs nowiking
				if ( is_string( $href ) && preg_match( '#https?://#', $href ) ) {
					return true;
				}
			} else {
				// Walk backwards over the previous non-separator siblings
				// until the first encapsulation wrapper node is found
				// (or the siblings run out, leaving $node null).
				while ( $node ) {
					$node = DiffDOMUtils::previousNonSepSibling( $node );
					if ( $node && WTUtils::isFirstEncapsulationWrapperNode( $node ) ) {
						// FIXME: This is not entirely correct.
						// Assumes that extlink content doesn't have templates.
						// Solution: Count # of non-nested templates encountered
						// and skip over intermediate templates.
						// var content = t.getAttribute('mw:content');
						// var n = intermediateNonNestedTemplates(content);
						break;
					}
				}

				if ( $node instanceof Element && DOMCompat::nodeName( $node ) === 'a' &&
					$node->textContent === DOMCompat::getAttribute( $node, 'href' )
				) {
					// The template expands to an url link => needs nowiking
					return true;
				}
			}

			// Since this will not parse to a real extlink,
			// update buf with the wikitext src for this token.
			$tsr = $t->dataParsoid->tsr;
			$buf = $tsr->substr( $str ) . $buf;
		} else {
			// We have no other smarts => be conservative.
			return true;
		}

		if ( $text === substr( $buf, -strlen( $text ) ) ) {
			// 'text' emerged unscathed
			return false;
		}
	}

	// We couldn't prove safety of skipping nowiki-ing.
	return true;
}
463 | |
/**
 * Tokenize $text and report whether the token stream contains anything
 * that requires the text to be escaped.
 *
 * Tokens are examined in order; each one is either ignored (continue)
 * or makes the method return true. If no token triggers escaping,
 * the result is false.
 *
 * Side effect: sets $state->hasHeadingEscapes when a heading TagTk is
 * what triggers the escaping.
 *
 * @param SerializerState $state
 * @param bool $onNewline Whether the text starts on a new line; combined
 *   with the indent-pre / PHP-block state to derive the tokenizer's sol flag
 * @param string $text
 * @return bool
 */
private function hasWikitextTokens(
	SerializerState $state, bool $onNewline, string $text
): bool {
	$env = $state->getEnv();
	$env->log(
		'trace/wt-escape', 'nl:', $onNewline, ':text=',
		static function () use ( $text ) {
			return PHPUtils::jsonEncode( $text );
		}
	);

	// tokenize the text
	$sol = $onNewline && !( $state->inIndentPre || $state->inPHPBlock );

	// If we're inside a <pre>, we need to add an extra space after every
	// newline so that the tokenizer correctly parses all tokens in a pre
	// instead of just the first one. See T95794.
	if ( $state->inIndentPre ) {
		$text = str_replace( "\n", "\n ", $text );
	}

	$tokens = $this->tokenizeStr( $text, $sol );

	// If the token stream has a TagTk, SelfclosingTagTk, EndTagTk or CommentTk
	// then this text needs escaping!
	// $numEntities counts mw:Entity span openers whose closing </span>
	// has not been seen yet.
	$numEntities = 0;
	foreach ( $tokens as $t ) {
		$env->log(
			'trace/wt-escape', 'T:',
			static function () use ( $t ) {
				return PHPUtils::jsonEncode( $t );
			}
		);

		// Ignore html tags that aren't allowed as literals in wikitext
		if ( TokenUtils::isHTMLTag( $t ) ) {
			// An extension token other than the one named by $this->extName
			// always needs escaping.
			if (
				TokenUtils::matchTypeOf( $t, '#^mw:Extension(/|$)#' ) &&
				( $this->extName !== $t->getAttributeV( 'name' ) )
			) {
				return true;
			}

			// Always escape isolated extension tags (T59469). Consider this:
			// echo "<ref>foo<p></ref></p>" | node parse --html2wt
			// The <ref> and </ref> tag-like text is spread across the DOM, and in
			// the worst case can be anywhere. So, we conservatively escape these
			// elements always (which can lead to excessive nowiki-escapes in some
			// cases, but is always safe).
			if ( ( $t instanceof TagTk || $t instanceof EndTagTk ) &&
				$env->getSiteConfig()->isExtensionTag( mb_strtolower( $t->getName() ) )
			) {
				return true;
			}

			// If the tag is one that's allowed in wikitext, we need to escape
			// it inside <nowiki>s, because a text node nodeValue always returns
			// non-escaped entities (that is, converts "&lt;h2&gt;" to "<h2>").
			// TODO: We should also do this for <a> tags because even if they
			// aren't allowed in wikitext and thus don't need to be escaped, the
			// result can be confusing for editors. However, doing it here in a
			// simple way interacts badly with normal link escaping, so it's
			// left for later.
			if ( isset( Consts::$Sanitizer['AllowedLiteralTags'][mb_strtolower( $t->getName() )] ) ) {
				return true;
			} else {
				continue;
			}
		}

		if ( $t instanceof SelfclosingTagTk ) {
			// * Ignore RFC/ISBN/PMID tokens when those are encountered in the
			// context of another link's content -- those are not parsed to
			// ext-links in that context. (T109371)
			if ( ( $t->getName() === 'extlink' || $t->getName() === 'wikilink' ) &&
				( $t->dataParsoid->stx ?? null ) === 'magiclink' &&
				( $state->inAttribute || $state->inLink ) ) {
				continue;
			}

			// Ignore url links in attributes (href, mostly)
			// since they are not in danger of being autolink-ified there.
			if ( $t->getName() === 'urllink' && ( $state->inAttribute || $state->inLink ) ) {
				continue;
			}

			// A wikilink token only matters if its target is a valid link target.
			if ( $t->getName() === 'wikilink' ) {
				if ( $env->isValidLinkTarget( $t->getAttributeV( 'href' ) ?? '' ) ) {
					return true;
				} else {
					continue;
				}
			}

			return true;
		}

		// List item tokens are ignored while serializing a caption.
		if ( $state->inCaption && $t instanceof TagTk && $t->getName() === 'listItem' ) {
			continue;
		}

		if ( $t instanceof TagTk ) {
			// Ignore mw:Entity tokens
			if ( $t->getName() === 'span' && TokenUtils::hasTypeOf( $t, 'mw:Entity' ) ) {
				$numEntities++;
				continue;
			}

			// Ignore table tokens outside of tables
			if ( in_array( $t->getName(), [ 'caption', 'td', 'tr', 'th' ], true ) &&
				!TokenUtils::isHTMLTag( $t ) &&
				$state->wikiTableNesting === 0
			) {
				continue;
			}

			// Headings have both SOL and EOL requirements. This tokenization
			// here only verifies SOL requirements, not EOL requirements.
			// So, record this information so that we can strip unnecessary
			// nowikis after the fact.
			if ( preg_match( '/^h\d$/D', $t->getName() ) ) {
				$state->hasHeadingEscapes = true;
			}

			return true;
		}

		if ( $t instanceof EndTagTk ) {
			// Ignore mw:Entity tokens
			if ( $numEntities > 0 && $t->getName() === 'span' ) {
				$numEntities--;
				continue;
			}
			// Ignore heading tokens
			if ( preg_match( '/^h\d$/D', $t->getName() ) ) {
				continue;
			}

			// Ignore table tokens outside of tables
			if ( isset( ( [ 'caption' => 1, 'table' => 1 ] )[$t->getName()] ) &&
				$state->wikiTableNesting === 0
			) {
				continue;
			}

			// </br>!
			if ( mb_strtolower( $t->getName() ) === 'br' ) {
				continue;
			}

			return true;
		}
	}

	return false;
}
620 | |
/**
 * Append $str to $buf inside a nowiki span.
 *
 * A '<nowiki>' opener is emitted first unless a span is already open.
 * The span is closed after $str only when $close is set.
 *
 * @param string $str Text to append
 * @param bool $close Whether to emit '</nowiki>' after $str
 * @param bool &$inNowiki Whether a nowiki span is currently open (updated)
 * @param bool &$nowikisAdded Set to true once any opener has been emitted
 * @param string &$buf Output buffer (appended to)
 */
private static function nowikiWrap(
	string $str, bool $close, bool &$inNowiki, bool &$nowikisAdded, string &$buf
): void {
	$opener = '';
	if ( !$inNowiki ) {
		$opener = '<nowiki>';
		$inNowiki = true;
		$nowikisAdded = true;
	}

	$closer = '';
	if ( $close ) {
		$closer = '</nowiki>';
		$inNowiki = false;
	}

	$buf .= $opener . $str . $closer;
}
635 | |
/**
 * This function attempts to wrap smallest escapable units into
 * nowikis (which can potentially add multiple nowiki pairs in a
 * single string). The idea here is that since this should all be
 * text, anything that tokenizes to another construct needs to be
 * wrapped.
 *
 * Full-wrapping is enabled if the string is being escaped within
 * context-specific handlers where the tokenization context might
 * be different from what we use in this code.
 *
 * Trailing newlines of $origText are never wrapped; they are split off
 * up front and re-appended to the result unchanged.
 *
 * @param SerializerState $state
 * @param bool $sol Whether the text starts in start-of-line position
 * @param string $origText
 * @param bool $fullWrap Wrap the whole text (minus trailing newlines)
 *   in a single nowiki pair without tokenizing it
 * @param bool $dontWrapIfUnnecessary When false, the text is wrapped as a
 *   whole if tokenizing it did not lead to any nowiki being added
 * @return string
 */
public function escapedText(
	SerializerState $state, bool $sol, string $origText,
	bool $fullWrap = false, bool $dontWrapIfUnnecessary = false
): string {
	// Split into the text proper and its run of trailing newlines.
	Assert::invariant(
		preg_match( '/^(.*?)((?:\r?\n)*)$/sD', $origText, $match ),
		"Escaped text matching failed: {$origText}"
	);

	$text = $match[1];
	$nls = $match[2];

	if ( $fullWrap ) {
		return '<nowiki>' . $text . '</nowiki>' . $nls;
	}

	$buf = '';
	$inNowiki = false;
	$nowikisAdded = false;
	// These token types don't come with a closing tag
	$tokensWithoutClosingTag = PHPUtils::makeSet( [ 'listItem', 'td', 'tr' ] );

	// reverse escaping nowiki tags
	// we do this so that they tokenize as nowikis
	// instead of entity enclosed text
	//
	// The pattern has to match the entity-escaped form (&lt;nowiki&gt; etc.);
	// matching literal angle brackets would replace each tag by itself.
	$text = preg_replace( '#&lt;(/?nowiki\s*/?\s*)&gt;#i', '<$1>', $text );

	$tokens = $this->tokenizeStr( $text, $sol );

	foreach ( $tokens as $t ) {
		if ( is_string( $t ) ) {
			if ( strlen( $t ) > 0 ) {
				$t = WTSUtils::escapeNowikiTags( $t );
				// A space in sol position (at the very start, or right after
				// a newline) gets wrapped in its own nowiki pair.
				if ( !$inNowiki && ( ( $sol && $t[0] === ' ' ) || str_contains( $t, "\n " ) ) ) {
					// $x alternates: text, delimiter ('' or "\n"), text, ...
					$x = preg_split( '/(^|\n) /', $t, -1, PREG_SPLIT_DELIM_CAPTURE );
					$buf .= $x[0];
					$lastIndexX = count( $x ) - 1;
					for ( $k = 1; $k < $lastIndexX; $k += 2 ) {
						$buf .= $x[$k];
						if ( $k !== 1 || $x[$k] === "\n" || $sol ) {
							self::nowikiWrap( ' ', true, $inNowiki, $nowikisAdded, $buf );
						} else {
							$buf .= ' ';
						}
						$buf .= $x[$k + 1];
					}
				} else {
					$buf .= $t;
				}
				$sol = false;
			}
			continue;
		}

		$tsr = $t->dataParsoid->tsr ?? null;
		if ( !( $tsr instanceof SourceRange ) ) {
			$env = $state->getEnv();
			$env->log(
				'error/html2wt/escapeNowiki',
				'Missing tsr for token ',
				PHPUtils::jsonEncode( $t ),
				'while processing text ',
				$text
			);

			// Bail and wrap the whole thing in a nowiki
			// if we have missing information.
			// Use match[1] since text has been clobbered above.
			return '<nowiki>' . $match[1] . '</nowiki>' . $nls;
		}

		// Now put back the escaping we removed above
		$tSrc = WTSUtils::escapeNowikiTags( $tsr->substr( $text ) );
		switch ( $t->getType() ) {
			case 'NlTk':
				$buf .= $tSrc;
				$sol = true;
				break;
			case 'CommentTk':
				// Comments are sol-transparent
				$buf .= $tSrc;
				break;
			case 'TagTk':
				// Treat tokens with missing tags as self-closing tokens
				// for the purpose of minimal nowiki escaping
				self::nowikiWrap(
					$tSrc,
					isset( $tokensWithoutClosingTag[$t->getName()] ),
					$inNowiki,
					$nowikisAdded,
					$buf
				);
				$sol = false;
				break;
			case 'EndTagTk':
				self::nowikiWrap( $tSrc, true, $inNowiki, $nowikisAdded, $buf );
				$sol = false;
				break;
			case 'SelfclosingTagTk':
				if ( $t->getName() !== 'meta' ||
					!TokenUtils::hasTypeOf( $t, 'mw:EmptyLine' )
				) {
					// Don't bother with marker or empty-line metas
					self::nowikiWrap( $tSrc, true, $inNowiki, $nowikisAdded, $buf );
				}
				$sol = false;
				break;
		}
	}

	// close any unclosed nowikis
	if ( $inNowiki ) {
		$buf .= '</nowiki>';
	}

	// Make sure nowiki is always added
	// Ex: "foo]]" won't tokenize into tags at all
	if ( !$nowikisAdded && !$dontWrapIfUnnecessary ) {
		$buf = '';
		self::nowikiWrap( $text, true, $inNowiki, $nowikisAdded, $buf );
	}

	$buf .= $nls;
	return $buf;
}
779 | |
780 | /** |
781 | * @param SerializerState $state |
782 | * @param string $text |
783 | * @param array $opts [ 'node' => Node, 'inMultilineMode' => ?bool, 'isLastChild' => ?bool ] |
784 | * @return string |
785 | */ |
786 | public function escapeWikitext( SerializerState $state, string $text, array $opts ): string { |
787 | $env = $state->getEnv(); |
788 | $env->log( |
789 | 'trace/wt-escape', 'EWT:', |
790 | static function () use ( $text ) { |
791 | return PHPUtils::jsonEncode( $text ); |
792 | } |
793 | ); |
794 | |
795 | /* ----------------------------------------------------------------- |
796 | * General strategy: If a substring requires escaping, we can escape |
797 | * the entire string without further analysis of the rest of the string. |
798 | * ----------------------------------------------------------------- */ |
799 | |
800 | $hasMagicWord = preg_match( '/(^|\W)(RFC|ISBN|PMID)\s/', $text ); |
801 | $hasAutolink = $env->getSiteConfig()->findValidProtocol( $text ); |
802 | $fullCheckNeeded = !$state->inLink && ( $hasMagicWord || $hasAutolink ); |
803 | $hasQuoteChar = false; |
804 | $indentPreUnsafe = false; |
805 | $hasNonQuoteEscapableChars = false; |
806 | $indentPreSafeMode = $state->inIndentPre || $state->inPHPBlock; |
807 | $sol = $state->onSOL && !$indentPreSafeMode; |
808 | |
809 | // Fast path for special protected characters. |
810 | if ( $state->protect && preg_match( $state->protect, $text ) ) { |
811 | return $this->escapedText( $state, $sol, $text ); |
812 | } |
813 | |
814 | if ( !$fullCheckNeeded ) { |
815 | $hasQuoteChar = str_contains( $text, "'" ); |
816 | $indentPreUnsafe = !$indentPreSafeMode && ( |
817 | preg_match( '/\n +[^\r\n]*?\S+/', $text ) || |
818 | ( $sol && preg_match( '/^ +[^\r\n]*?\S+/', $text ) ) |
819 | ); |
820 | $hasNonQuoteEscapableChars = preg_match( '/[<>\[\]\-\+\|!=#\*:;~{}]|__[^_]*__/', $text ); |
821 | $hasLanguageConverter = preg_match( '/-\{|\}-/', $text ); |
822 | if ( $hasLanguageConverter ) { |
823 | $fullCheckNeeded = true; |
824 | } |
825 | } |
826 | |
827 | // Quick check for the common case (useful to kill a majority of requests) |
828 | // |
829 | // Pure white-space or text without wt-special chars need not be analyzed |
830 | if ( !$fullCheckNeeded && !$hasQuoteChar && !$indentPreUnsafe && !$hasNonQuoteEscapableChars ) { |
831 | $env->log( 'trace/wt-escape', '---No-checks needed---' ); |
832 | return $text; |
833 | } |
834 | |
835 | // Context-specific escape handler |
836 | $wteHandler = PHPUtils::lastItem( $state->wteHandlerStack ); |
837 | if ( $wteHandler && $wteHandler( $state, $text, $opts ) ) { |
838 | $env->log( 'trace/wt-escape', '---Context-specific escape handler---' ); |
839 | return $this->escapedText( $state, false, $text, true ); |
840 | } |
841 | |
842 | // Quote-escape test |
843 | if ( str_contains( $text, "''" ) || |
844 | self::hasLeadingEscapableQuoteChar( $text, $opts ) || |
845 | self::hasTrailingEscapableQuoteChar( $text, $opts ) |
846 | ) { |
847 | // Check if we need full-wrapping <nowiki>..</nowiki> |
848 | // or selective <nowiki/> escaping for quotes. |
849 | if ( $fullCheckNeeded || |
850 | $indentPreUnsafe || |
851 | ( $hasNonQuoteEscapableChars && |
852 | $this->hasWikitextTokens( $state, $sol, $text ) |
853 | ) |
854 | ) { |
855 | $env->log( 'trace/wt-escape', '---quotes: escaping text---' ); |
856 | // If the reason for full wrap is that the text contains non-quote |
857 | // escapable chars, it's still possible to minimize the contents |
858 | // of the <nowiki> (T71950). |
859 | return $this->escapedText( $state, $sol, $text ); |
860 | } else { |
861 | $quoteEscapedText = self::escapedIBSiblingNodeText( $state, $text, $opts ); |
862 | if ( $quoteEscapedText ) { |
863 | $env->log( 'trace/wt-escape', '---sibling of i/b tag---' ); |
864 | return $quoteEscapedText; |
865 | } |
866 | } |
867 | } |
868 | |
869 | // Template and template-arg markers are escaped unconditionally! |
870 | // Conditional escaping requires matching brace pairs and knowledge |
871 | // of whether we are in template arg context or not. |
872 | if ( preg_match( '/\{\{\{|\{\{|\}\}\}|\}\}/', $text ) ) { |
873 | $env->log( 'trace/wt-escape', '---Unconditional: transclusion chars---' ); |
874 | return $this->escapedText( $state, false, $text ); |
875 | } |
876 | |
877 | // Once we eliminate the possibility of multi-line tokens, split the text |
878 | // around newlines and escape each line separately. |
879 | if ( preg_match( '/\n./', $text ) ) { |
880 | $env->log( 'trace/wt-escape', '-- <multi-line-escaping-mode> --' ); |
881 | // We've already processed the full string in a context-specific handler. |
882 | // No more additional processing required. So, push/pop a null handler. |
883 | $state->wteHandlerStack[] = null; |
884 | |
885 | $tmp = []; |
886 | foreach ( explode( "\n", $text ) as $i => $line ) { |
887 | if ( $i > 0 ) { |
888 | // Update state |
889 | $state->onSOL = true; |
890 | $state->currLine->text = ''; |
891 | $opts['inMultilineMode'] = true; |
892 | } |
893 | $tmp[] = $this->escapeWikitext( $state, $line, $opts ); |
894 | } |
895 | $ret = implode( "\n", $tmp ); |
896 | |
897 | array_pop( $state->wteHandlerStack ); |
898 | |
899 | // If nothing changed, check if the original multiline string has |
900 | // any wikitext tokens (ex: multi-line html tags <div\n>foo</div\n>). |
901 | if ( $ret === $text && $this->hasWikitextTokens( $state, $sol, $text ) ) { |
902 | $env->log( 'trace/wt-escape', '---Found multi-line wt tokens---' ); |
903 | $ret = $this->escapedText( $state, $sol, $text ); |
904 | } |
905 | |
906 | $env->log( 'trace/wt-escape', '-- </multi-line-escaping-mode> --' ); |
907 | return $ret; |
908 | } |
909 | |
910 | $env->log( |
911 | 'trace/wt-escape', 'SOL:', $sol, |
912 | static function () use ( $text ) { |
913 | return PHPUtils::jsonEncode( $text ); |
914 | } |
915 | ); |
916 | |
917 | $hasTildes = preg_match( '/~{3,5}/', $text ); |
918 | if ( !$fullCheckNeeded && !$hasTildes ) { |
919 | // {{, {{{, }}}, }} are handled above. |
920 | // Test 1: '', [], <>, __FOO__ need escaping wherever they occur |
921 | // = needs escaping in end-of-line context |
922 | // Test 2: {|, |}, ||, |-, |+, , *#:;, ----, =*= need escaping only in SOL context. |
923 | if ( !$sol && !preg_match( "/''|[<>]|\\[.*\\]|\\]|(=[ ]*(\\n|$))|__[^_]*__/", $text ) ) { |
924 | // It is not necessary to test for an unmatched opening bracket ([) |
925 | // as long as we always escape an unmatched closing bracket (]). |
926 | $env->log( 'trace/wt-escape', '---Not-SOL and safe---' ); |
927 | return $text; |
928 | } |
929 | |
930 | // Quick checks when on a newline |
931 | // + can only occur as "|+" and - can only occur as "|-" or ---- |
932 | if ( $sol && !preg_match( '/(^|\n)[ #*:;=]|[<\[\]>\|\'!]|\-\-\-\-|__[^_]*__/', $text ) ) { |
933 | $env->log( 'trace/wt-escape', '---SOL and safe---' ); |
934 | return $text; |
935 | } |
936 | } |
937 | |
938 | // The front-end parser eliminated pre-tokens in the tokenizer |
939 | // and moved them to a stream handler. So, we always conservatively |
940 | // escape text with ' ' in sol posn with one caveat: |
941 | // * and when the current line has block tokens |
942 | if ( $indentPreUnsafe && |
943 | ( !self::hasBlocksOnLine( $state->currLine->firstNode, true ) || |
944 | !empty( $opts['inMultilineMode'] ) |
945 | ) |
946 | ) { |
947 | $env->log( 'trace/wt-escape', '---SOL and pre---' ); |
948 | $state->hasIndentPreNowikis = true; |
949 | return $this->escapedText( $state, $sol, $text ); |
950 | } |
951 | |
952 | // escape nowiki tags |
953 | $text = WTSUtils::escapeNowikiTags( $text ); |
954 | |
955 | // Use the tokenizer to see if we have any wikitext tokens |
956 | // |
957 | // Ignores entities |
958 | if ( $hasTildes ) { |
959 | $env->log( 'trace/wt-escape', '---Found tildes---' ); |
960 | return $this->escapedText( $state, $sol, $text ); |
961 | } elseif ( $this->hasWikitextTokens( $state, $sol, $text ) ) { |
962 | $env->log( 'trace/wt-escape', '---Found WT tokens---' ); |
963 | return $this->escapedText( $state, $sol, $text ); |
964 | } elseif ( preg_match( '/[^\[]*\]/', $text ) && |
965 | $this->textCanParseAsLink( $opts['node'], $state, $text ) |
966 | ) { |
967 | // we have an closing bracket, and |
968 | // - the text will get parsed as a link in |
969 | $env->log( 'trace/wt-escape', '---Links: complex single-line test---' ); |
970 | return $this->escapedText( $state, $sol, $text ); |
971 | } elseif ( !empty( $opts['isLastChild'] ) && substr( $text, -1 ) === '=' ) { |
972 | // 1. we have an open heading char, and |
973 | // - text ends in a '=' |
974 | // - text comes from the last child |
975 | preg_match( '/^h(\d)/', DOMCompat::nodeName( $state->currLine->firstNode ), $headingMatch ); |
976 | if ( $headingMatch ) { |
977 | $n = intval( $headingMatch[1] ); |
978 | if ( ( $state->currLine->text . $text )[$n] === '=' ) { |
979 | // The first character after the heading wikitext is/will be a '='. |
980 | // So, the trailing '=' can change semantics if it is not nowikied. |
981 | $env->log( 'trace/wt-escape', '---Heading: complex single-line test---' ); |
982 | return $this->escapedText( $state, $sol, $text ); |
983 | } else { |
984 | return $text; |
985 | } |
986 | } elseif ( strlen( $state->currLine->text ) > 0 && $state->currLine->text[0] === '=' ) { |
987 | $env->log( 'trace/wt-escape', '---Text-as-heading: complex single-line test---' ); |
988 | return $this->escapedText( $state, $sol, $text ); |
989 | } else { |
990 | return $text; |
991 | } |
992 | } else { |
993 | $env->log( 'trace/wt-escape', '---All good!---' ); |
994 | return $text; |
995 | } |
996 | } |
997 | |
/**
 * Append one fragment of argument wikitext to the output buffer, opening or
 * closing a <nowiki> wrapper around it as required.
 *
 * When $checkNowiki is false the fragment is appended verbatim (after closing
 * any <nowiki> that is still open). Otherwise the fragment is inspected for
 * constructs that must be protected and is either minimally substituted or
 * placed inside a <nowiki> section.
 *
 * @param string $str Fragment to append
 * @param bool $isLast Whether this is the last fragment of the argument
 * @param bool $checkNowiki Whether $str has to be inspected for escaping
 * @param string &$buf Output buffer; the fragment is appended to it
 * @param bool &$openNowiki Whether $buf currently ends inside an open <nowiki>
 * @param bool $isTemplate
 * @param bool &$serializeAsNamed Set to true when the argument should be
 *   emitted as a named parameter instead of nowiki-escaping its '='
 * @param array $opts [ 'numPositionalArgs' => int, 'argPositionalIndex' => int, 'type' => string,
 *   'numArgs' => int, 'argIndex' => int ]
 */
private static function appendStr(
	string $str, bool $isLast, bool $checkNowiki, string &$buf, bool &$openNowiki,
	bool $isTemplate, bool &$serializeAsNamed, array $opts
): void {
	if ( $checkNowiki ) {
		$hasEquals = str_contains( $str, '=' );

		// '=' is not allowed in positional parameters: it either has to be
		// nowiki-escaped or the parameter has to be emitted as a named one.
		//
		// Switching to a named parameter is not always the better choice.
		// Given {{funky-tpl|a|b|c|d|e|f|g|h}}, editing 'a' into 'f=oo' and
		// naming it (1=f=oo) forces every later parameter to be named too:
		//   {{funky-tpl|1=f=oo|2=b|3=c|...|8=h}}
		// whereas the nowiki form leaves the rest untouched:
		//   {{funky-tpl|<nowiki>f=oo</nowiki>|b|c|...|h}}
		//
		// Simple heuristic (can be refined later): go named only when
		// 1. there were no original positional args, or
		// 2. this is the last positional arg.
		if ( $isTemplate && !$serializeAsNamed && $hasEquals ) {
			$positionalCount = $opts['numPositionalArgs'];
			if ( $positionalCount === 0 || $positionalCount === $opts['argPositionalIndex'] ) {
				$serializeAsNamed = true;
			}
		}

		// Number of independent reasons for protecting this fragment, and the
		// targeted replacement (if any) belonging to the latest such reason.
		$reasons = 0;
		$substitution = null;

		// Unmatched braces / brackets must never show up in template
		// arguments, so drop the well-formed pairs and look at what is left.
		$withoutBalancedPairs = preg_replace(
			'/\[\[([^\[\]]*)\]\]|\{\{([^\{\}]*)\}\}|-\{([^\{\}]*)\}-/',
			'_$1_',
			$str
		);
		if ( preg_match( '/\{\{|\}\}|\[\[|\]\]|-\{/', $withoutBalancedPairs ) ) {
			$reasons++;
		}

		if ( $opts['type'] !== 'templatearg' && !$serializeAsNamed && $hasEquals ) {
			$reasons++;
		}

		if ( $opts['argIndex'] === $opts['numArgs'] && $isLast && str_ends_with( $str, '}' ) ) {
			// Last fragment of the last argument: a trailing } would be
			// confused with the }} that closes the template.
			$reasons++;
			$substitution = [ '/(\})$/D', '<nowiki>}</nowiki>' ];
		}

		if ( str_contains( $str, '|' ) ) {
			// An unprotected | would be read as the start of another parameter.
			$reasons++;
			$substitution = [ '/\|/', '{{!}}' ];
		}

		if ( $openNowiki ) {
			// Already inside a <nowiki>: stay there while protection is
			// needed, leave it as soon as it is not.
			if ( $reasons === 0 ) {
				$buf .= '</nowiki>';
				$openNowiki = false;
			}
		} elseif ( $reasons === 1 && $substitution ) {
			// If we aren't already in a <nowiki> and there is exactly one
			// reason to protect, substitute just the offending characters
			// instead of guarding the whole fragment.
			[ $pattern, $replacement ] = $substitution;
			$str = preg_replace( $pattern, $replacement, $str );
		} elseif ( $reasons > 0 ) {
			$buf .= '<nowiki>';
			$openNowiki = true;
		}
	} elseif ( $openNowiki ) {
		// Nothing to inspect: just make sure no <nowiki> is left dangling.
		$buf .= '</nowiki>';
		$openNowiki = false;
	}

	$buf .= $str;
}
1097 | |
/**
 * Escape the wikitext of a single template / template-arg argument.
 *
 * General strategy:
 *
 * Tokenize the arg wikitext. Anything that parses as tags
 * are good and we need not bother with those. Check for harmful
 * characters `[[]]{{}}` or additionally `=` in positional parameters and escape
 * those fragments since these characters could change semantics of the entire
 * template transclusion.
 *
 * This function makes a couple of assumptions:
 *
 * 1. The tokenizer sets tsr on all non-string tokens.
 * 2. The tsr on TagTk and EndTagTk corresponds to the
 *    width of the opening and closing wikitext tags and not
 *    the entire DOM range they span in the end.
 *
 * @param string $arg Wikitext of the argument
 * @param array $opts [ 'serializeAsNamed' => bool, 'numPositionalArgs' => int,
 *   'argPositionalIndex' => int, 'type' => string, 'numArgs' => int, 'argIndex' => int ]
 * @return array [ 'serializeAsNamed' => bool, 'v' => string ] where 'v' is the
 *   escaped wikitext and 'serializeAsNamed' tells the caller whether the
 *   argument should be emitted as a named parameter.
 */
public function escapeTplArgWT( string $arg, array $opts ): array {
	$env = $this->env;
	$serializeAsNamed = $opts['serializeAsNamed'];
	$buf = '';
	$openNowiki = false;
	$isTemplate = $opts['type'] === 'template';

	$tokens = $this->tokenizeStr( $arg, false );

	// Reports a token that violates assumption 1 above (no tsr), together
	// with the argument and the full token stream for debugging.
	$logMissingTsr = static function ( $tok ) use ( $env, $arg, $tokens ): void {
		$errors = [ 'Missing tsr for: ' . PHPUtils::jsonEncode( $tok ) ];
		$errors[] = 'Arg : ' . PHPUtils::jsonEncode( $arg );
		$errors[] = 'Toks: ' . PHPUtils::jsonEncode( $tokens );
		$env->log( 'error/html2wt/wtescape', implode( "\n", $errors ) );
	};

	for ( $i = 0, $n = count( $tokens ); $i < $n; $i++ ) {
		$t = $tokens[$i];
		$last = $i === $n - 1;

		// For mw:Entity spans, the opening and closing tags have 0 width
		// and the enclosed content is the decoded entity. Hence the
		// special case to serialize back the entity's source.
		if ( $t instanceof TagTk ) {
			$da = $t->dataParsoid;
			if ( TokenUtils::matchTypeOf( $t, '#^mw:(Placeholder|Entity)(/|$)#' ) ) {
				// Skip over the content token to the matching end tag and
				// copy the source range spanned by open tag .. end tag.
				$i += 2;
				$width = $tokens[$i]->dataParsoid->tsr->end - $da->tsr->start;
				self::appendStr(
					substr( $arg, $da->tsr->start, $width ),
					$last,
					false,
					$buf,
					$openNowiki,
					$isTemplate,
					$serializeAsNamed,
					$opts
				);
				continue;
			} elseif ( TokenUtils::hasTypeOf( $t, 'mw:Nowiki' ) ) {
				// Advance to the end tag that closes this nowiki section.
				$i++;
				while ( $i < $n &&
					( !$tokens[$i] instanceof EndTagTk ||
						!TokenUtils::hasTypeOf( $tokens[$i], 'mw:Nowiki' )
					)
				) {
					$i++;
				}
				if ( $i < $n ) {
					// After tokenization, we can get here:
					// * Text explicitly protected by <nowiki> in the parameter.
					// * Other things that should be protected but weren't
					//   according to the tokenizer.
					// In template argument, we only need to check for unmatched
					// braces and brackets pairs (which is done in appendStr),
					// but only if they weren't explicitly protected in the
					// passed wikitext.
					$width = $tokens[$i]->dataParsoid->tsr->end - $da->tsr->start;
					$substr = substr( $arg, $da->tsr->start, $width );
					self::appendStr(
						$substr,
						$last,
						!preg_match( '#<nowiki>[^<]*</nowiki>#', $substr ),
						$buf,
						$openNowiki,
						$isTemplate,
						$serializeAsNamed,
						$opts
					);
				}
				continue;
			}
		}

		// Plain text is the only thing that always needs the full check.
		if ( is_string( $t ) ) {
			self::appendStr(
				$t,
				$last,
				true,
				$buf,
				$openNowiki,
				$isTemplate,
				$serializeAsNamed,
				$opts
			);
			continue;
		}

		switch ( $t->getType() ) {
			case 'TagTk':
			case 'EndTagTk':
			case 'NlTk':
			case 'CommentTk':
				$da = $t->dataParsoid;
				if ( empty( $da->tsr ) ) {
					$logMissingTsr( $t );
					// FIXME $da->tsr will be undefined below.
					// Should we throw an explicit exception here?
				}
				// These tokens are emitted back exactly as they appeared in the source.
				self::appendStr(
					$da->tsr->substr( $arg ),
					$last,
					false,
					$buf,
					$openNowiki,
					$isTemplate,
					$serializeAsNamed,
					$opts
				);
				break;
			case 'SelfclosingTagTk':
				$da = $t->dataParsoid;
				if ( empty( $da->tsr ) ) {
					$logMissingTsr( $t );
					// FIXME $da->tsr will be undefined below.
					// Should we throw an explicit exception here?
				}
				$tkSrc = $da->tsr->substr( $arg );
				// Replace pipe by an entity. This is not completely safe.
				if ( $t->getName() === 'extlink' || $t->getName() === 'urllink' ) {
					$tkBits = $this->tokenizer->tokenizeSync( $tkSrc, [
						'startRule' => 'tplarg_or_template_or_bust',
						'sol' => true,
					] );
					foreach ( $tkBits as $bit ) {
						if ( $bit instanceof Token ) {
							self::appendStr(
								$bit->dataParsoid->src,
								$last,
								false,
								$buf,
								$openNowiki,
								$isTemplate,
								$serializeAsNamed,
								$opts
							);
						} else {
							// Convert to a named param w/ the same reasoning
							// as in appendStr, however, here we replace
							// with an entity to avoid breaking up querystrings
							// with nowikis.
							//
							// NOTE(review): appendStr compares numPositionalArgs
							// against 'argPositionalIndex' while this compares
							// against 'argIndex' -- confirm which is intended.
							if ( $isTemplate && !$serializeAsNamed && str_contains( $bit, '=' ) ) {
								if ( $opts['numPositionalArgs'] === 0
									|| $opts['numPositionalArgs'] === $opts['argIndex']
								) {
									$serializeAsNamed = true;
								} else {
									// Numeric character reference for '=' so the
									// text is not read as "name=value".
									$bit = str_replace( '=', '&#61;', $bit );
								}
							}
							// Numeric character reference for '|' so the text
							// is not read as a parameter separator.
							$buf .= str_replace( '|', '&#124;', $bit );
						}
					}
				} else {
					self::appendStr(
						$tkSrc,
						$last,
						false,
						$buf,
						$openNowiki,
						$isTemplate,
						$serializeAsNamed,
						$opts
					);
				}
				break;
			case 'EOFTk':
				break;
		}
	}

	// If nowiki still open, close it now.
	if ( $openNowiki ) {
		$buf .= '</nowiki>';
	}

	return [ 'serializeAsNamed' => $serializeAsNamed, 'v' => $buf ];
}
1296 | |
/**
 * Entity-escape and then wikitext-escape a string that is going to be
 * emitted as link content.
 *
 * While escaping, the serializer state is adjusted: onSOL is set to
 * $solState, a link-specific escape handler (media option handler for media,
 * wikilink handler otherwise) is pushed on the handler stack and inLink is
 * raised. The handler is popped and inLink is reset to false afterwards.
 *
 * See also `escapeLinkTarget` in LinkHandler.php
 *
 * @param SerializerState $state
 * @param string $str
 * @param bool $solState
 * @param Node $node
 * @param bool $isMedia
 * @return string
 */
public function escapeLinkContent(
	SerializerState $state, string $str, bool $solState, Node $node, bool $isMedia
): string {
	// Pick the context-specific handler that matches the kind of link.
	if ( $isMedia ) {
		$handlerName = 'mediaOptionHandler';
	} else {
		$handlerName = 'wikilinkHandler';
	}

	// Step 1: entities.
	$entityEscaped = Utils::escapeWtEntities( $str );

	// Step 2: wikitext, with the link context set up around the call.
	$state->onSOL = $solState;
	$state->wteHandlerStack[] = [ $this, $handlerName ];
	$state->inLink = true;
	$escaped = $this->escapeWikitext( $state, $entityEscaped, [ 'node' => $node ] );
	$state->inLink = false;
	array_pop( $state->wteHandlerStack );

	return $escaped;
}
1325 | } |