Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 573 |
|
0.00% |
0 / 22 |
CRAP | |
0.00% |
0 / 1 |
WikitextEscapeHandlers | |
0.00% |
0 / 573 |
|
0.00% |
0 / 22 |
61256 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
startsOnANewLine | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
hasBlocksOnLine | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
110 | |||
hasLeadingEscapableQuoteChar | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
20 | |||
hasTrailingEscapableQuoteChar | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
20 | |||
escapedIBSiblingNodeText | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
30 | |||
isFirstContentNode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
liHandler | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
42 | |||
thHandler | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
mediaOptionHandler | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
wikilinkHandler | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
aHandler | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
tdHandler | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
182 | |||
tokenizeStr | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
textCanParseAsLink | |
0.00% |
0 / 52 |
|
0.00% |
0 / 1 |
552 | |||
hasWikitextTokens | |
0.00% |
0 / 62 |
|
0.00% |
0 / 1 |
1640 | |||
nowikiWrap | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 | |||
escapedText | |
0.00% |
0 / 77 |
|
0.00% |
0 / 1 |
600 | |||
escapeWikitext | |
0.00% |
0 / 111 |
|
0.00% |
0 / 1 |
2652 | |||
appendStr | |
0.00% |
0 / 37 |
|
0.00% |
0 / 1 |
552 | |||
escapeTplArgWT | |
0.00% |
0 / 126 |
|
0.00% |
0 / 1 |
812 | |||
escapeLinkContent | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Html2Wt; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Parsoid\Config\Env; |
8 | use Wikimedia\Parsoid\DOM\Element; |
9 | use Wikimedia\Parsoid\DOM\Node; |
10 | use Wikimedia\Parsoid\Tokens\CommentTk; |
11 | use Wikimedia\Parsoid\Tokens\EndTagTk; |
12 | use Wikimedia\Parsoid\Tokens\EOFTk; |
13 | use Wikimedia\Parsoid\Tokens\NlTk; |
14 | use Wikimedia\Parsoid\Tokens\SelfClosingTagTk; |
15 | use Wikimedia\Parsoid\Tokens\SourceRange; |
16 | use Wikimedia\Parsoid\Tokens\TagTk; |
17 | use Wikimedia\Parsoid\Tokens\Token; |
18 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
19 | use Wikimedia\Parsoid\Utils\DOMCompat; |
20 | use Wikimedia\Parsoid\Utils\DOMUtils; |
21 | use Wikimedia\Parsoid\Utils\PHPUtils; |
22 | use Wikimedia\Parsoid\Utils\TokenUtils; |
23 | use Wikimedia\Parsoid\Utils\Utils; |
24 | use Wikimedia\Parsoid\Utils\WTUtils; |
25 | use Wikimedia\Parsoid\Wikitext\Consts; |
26 | use Wikimedia\Parsoid\Wt2Html\PegTokenizer; |
27 | |
28 | class WikitextEscapeHandlers { |
29 | |
30 | private const LINKS_ESCAPE_RE = '/(\[\[)|(\]\])|(-\{)|(^[^\[]*\]$)/D'; |
31 | |
32 | /** @var Env */ |
33 | private $env; |
34 | |
35 | /** @var ?string */ |
36 | private $extName; |
37 | |
38 | /** |
39 | * @var PegTokenizer |
40 | */ |
41 | private $tokenizer; |
42 | |
/**
 * @param Env $env Environment; also handed to the tokenizer created here
 * @param ?string $extName Name compared against the `name` attribute of
 *   extension tokens in hasWikitextTokens(); may be null
 */
public function __construct( Env $env, ?string $extName ) {
	$this->tokenizer = new PegTokenizer( $env );
	$this->extName = $extName;
	$this->env = $env;
}
48 | |
/**
 * True when $node's tag opens a block scope and the node is not a
 * literal HTML node.
 *
 * Lets callers ignore the cases where the serializer adds newlines that
 * are not present in the DOM.
 *
 * @param Node $node
 * @return bool
 */
private static function startsOnANewLine( Node $node ): bool {
	if ( !TokenUtils::tagOpensBlockScope( DOMCompat::nodeName( $node ) ) ) {
		return false;
	}
	return !WTUtils::isLiteralHTMLNode( $node );
}
59 | |
/**
 * Look ahead on the current line for block content.
 *
 * Walks $node and its following siblings (descending into element
 * children) until a wikitext block node is found or a text-ish node
 * containing a newline ends the line.
 *
 * @param Node $node Node to start scanning from
 * @param bool $first Whether $node is the first node on the line; if so,
 *   only its own text is inspected (ignoring a newline in the very first
 *   character) and scanning starts at its next sibling
 * @return bool
 */
private static function hasBlocksOnLine( Node $node, bool $first ): bool {
	$cur = $node;

	if ( $first ) {
		// We are at sol, so a "\n" in the very first character does not
		// count: only look at the text from the second character on.
		if ( str_contains( substr( $node->textContent, 1 ), "\n" ) ) {
			return false;
		}
		$cur = $node->nextSibling;
	}

	for ( ; $cur; $cur = $cur->nextSibling ) {
		if ( !( $cur instanceof Element ) ) {
			// A newline in non-element content ends the current line
			if ( str_contains( $cur->textContent, "\n" ) ) {
				return false;
			}
			continue;
		}

		if ( DOMUtils::isWikitextBlockNode( $cur ) ) {
			return !self::startsOnANewLine( $cur );
		}

		if ( $cur->hasChildNodes() &&
			self::hasBlocksOnLine( $cur->firstChild, false )
		) {
			return true;
		}
	}

	return false;
}
98 | |
/**
 * Does the node's text begin with a single quote that directly follows
 * a quote element (previous non-deleted sibling, or the parent when
 * there is no such sibling)?
 *
 * @param string $text Unused here; the test is done on the node's own text
 * @param array $opts [ 'node' => Node ]
 * @return bool
 */
private static function hasLeadingEscapableQuoteChar( string $text, array $opts ): bool {
	/** @var Node $node */
	$node = $opts['node'];

	// The check is made against 'node.textContent' because, unlike $text,
	// it still carries its newlines.
	// Ex: For this DOM: <i>x</i>\n'\n<i>y</i>
	// node.textContent = \n'\n and text = '
	// Newlines in front of the leading quote act as separators and make
	// <nowiki/> protection unnecessary.
	if ( !str_starts_with( $node->textContent, "'" ) ) {
		return false;
	}

	$neighbour = DiffDOMUtils::previousNonDeletedSibling( $node ) ?: $node->parentNode;
	return (bool)DOMUtils::isQuoteElt( $neighbour );
}
126 | |
/**
 * Does the node's text end with a single quote that is directly followed
 * by a quote element (next non-deleted sibling, or the parent when there
 * is no such sibling)?
 *
 * @param string $text Unused here; the test is done on the node's own text
 * @param array $opts [ 'node' => Node ]
 * @return bool
 */
private static function hasTrailingEscapableQuoteChar( string $text, array $opts ): bool {
	$node = $opts['node'];

	// The check is made against 'node.textContent' because, unlike $text,
	// it still carries its newlines.
	// Ex: For this DOM: <i>x</i>\n'\n<i>y</i>
	// node.textContent = \n'\n and text = '
	// Newlines after the trailing quote act as separators and make
	// <nowiki/> protection unnecessary.
	if ( !str_ends_with( $node->textContent, "'" ) ) {
		return false;
	}

	$neighbour = DiffDOMUtils::nextNonDeletedSibling( $node ) ?: $node->parentNode;
	return (bool)DOMUtils::isQuoteElt( $neighbour );
}
153 | |
/**
 * Quote-escape a text node that is a sibling of an i/b tag.
 *
 * SSS FIXME: By doing a DOM walkahead to identify what else is on the current line,
 * these heuristics can be improved. Ex: '<i>foo</i> blah blah does not require a
 * <nowiki/> after the single quote since we know that there are no other quotes on
 * the rest of the line that will be emitted. Similarly, '' does not need a <nowiki>
 * wrapper since there are no other quote chars on the line.
 *
 * @param SerializerState $state Gets hasQuoteNowikis set when a <nowiki/> is added
 * @param string $text
 * @param array $opts [ 'node' => Node ]
 * @return string Escaped text, or '' when no quote escaping was applied
 */
private static function escapedIBSiblingNodeText(
	SerializerState $state, string $text, array $opts
): string {
	// A run of 2+ quote chars has to be fully wrapped in
	// <nowiki>...</nowiki>; a <nowiki/> at the start and end doesn't work.
	//
	// Ex: ''<i>foo</i> should serialize to <nowiki>''</nowiki>''foo''.
	//
	// Serializing it to ''<nowiki/>''foo'' breaks html2html semantics
	// since it will parse back to <i><meta../></i>foo<i></i>
	if ( str_contains( $text, "''" ) ) {
		// Keep the wrapped part as short as possible: it runs from the
		// first quote char to the last quote char, both included.
		$from = strpos( $text, "'" );
		$to = strrpos( $text, "'" );
		return substr( $text, 0, $from ) .
			'<nowiki>' . substr( $text, $from, $to - $from + 1 ) . '</nowiki>' .
			substr( $text, $to + 1 );
	}

	// Otherwise only the head and/or tail may need <nowiki/> protection.
	$protectTail = self::hasTrailingEscapableQuoteChar( $text, $opts );
	$protectHead = self::hasLeadingEscapableQuoteChar( $text, $opts );
	if ( !$protectTail && !$protectHead ) {
		return '';
	}

	$state->hasQuoteNowikis = true;
	return ( $protectHead ? '<nowiki/>' : '' ) .
		$text .
		( $protectTail ? '<nowiki/>' : '' );
}
201 | |
/**
 * Whether $node has no previous sibling once deleted-node markers are
 * skipped.
 *
 * @param Node $node
 * @return bool
 */
public function isFirstContentNode( Node $node ): bool {
	$previous = DiffDOMUtils::previousNonDeletedSibling( $node );
	return $previous === null;
}
206 | |
/**
 * Escape test for text that is a direct child of a list item node.
 *
 * @param Node $liNode The list item (li / dt / ...) being serialized
 * @param SerializerState $state
 * @param string $text
 * @param array $opts [ 'node' => Node ]
 * @return bool True when $text needs escaping
 */
public function liHandler(
	Node $liNode, SerializerState $state, string $text, array $opts
): bool {
	/** @var Node $node */
	$node = $opts['node'];

	// Only direct children of the list item are handled here
	if ( $node->parentNode !== $liNode ) {
		return false;
	}

	// Inside a <dt>, a ":" outside of elements triggers nowiki
	if ( DOMCompat::nodeName( $liNode ) === 'dt' && str_contains( $text, ':' ) ) {
		return true;
	}

	// Wikitext styling might require whitespace insertion after list bullets.
	// In those scenarios, presence of bullet-wikitext in the text node is okay.
	// Hence the requirement that the current line holds nothing but bullets.
	$onlyBulletsSoFar = preg_match( '/^[#*:;]*$/D', $state->currLine->text );
	if ( !$onlyBulletsSoFar || !$this->isFirstContentNode( $node ) ) {
		return false;
	}

	// For first nodes of <li>'s, a bullet char in sol position triggers escaping
	return strspn( $text, '#*:;', 0, 1 ) > 0;
}
238 | |
/**
 * Escape test for text inside a table heading cell.
 *
 * @param Node $thNode
 * @param SerializerState $state
 * @param string $text
 * @param array $opts [ 'node' => Node ]
 * @return bool True when $text needs escaping
 */
public function thHandler(
	Node $thNode, SerializerState $state, string $text, array $opts
): bool {
	// {|
	// !a<div>!!b</div>
	// !c<div>||d</div>
	// |}
	//
	// The <div> will get split across two <th> tags because
	// the !! and | has higher precedence in the tokenizer.
	//
	// So, no matter where in the DOM subtree of the <th> node
	// that text shows up in, we have to unconditionally escape
	// the !! and | characters.
	//
	// That is, so long as it serializes to the same line as the
	// heading was started.
	if ( !preg_match( '/^\s*!/', $state->currLine->text ) ) {
		return false;
	}
	return (bool)preg_match( '/^[^\n]*!!|\|/', $text );
}
266 | |
/**
 * Media option text needs escaping when it contains a pipe or anything
 * matched by LINKS_ESCAPE_RE.
 *
 * @param SerializerState $state
 * @param string $text
 * @return bool
 */
public function mediaOptionHandler( SerializerState $state, string $text ): bool {
	if ( str_contains( $text, '|' ) ) {
		return true;
	}
	return (bool)preg_match( self::LINKS_ESCAPE_RE, $text );
}
270 | |
/**
 * Wikilink text needs escaping when LINKS_ESCAPE_RE matches it.
 *
 * @param SerializerState $state
 * @param string $text
 * @return bool
 */
public function wikilinkHandler( SerializerState $state, string $text ): bool {
	$matched = preg_match( self::LINKS_ESCAPE_RE, $text );
	return $matched === 1;
}
274 | |
/**
 * Text handled here needs escaping as soon as it contains a "]".
 *
 * @param SerializerState $state
 * @param string $text
 * @return bool
 */
public function aHandler( SerializerState $state, string $text ): bool {
	$hasClosingBracket = str_contains( $text, ']' );
	return $hasClosingBracket;
}
278 | |
/**
 * Escape test for text inside a table cell.
 *
 * "|" anywhere in a text node of the <td> subtree can be trouble!
 * It is not sufficient to just look at immediate child of <td>
 * Try parsing the following table:
 *
 * {|
 * |a''b|c''
 * |}
 *
 * Similarly, "-" or "+" when emitted after a "|" in sol position
 * is trouble, but in addition to showing up as the immediate first
 * child of tdNode, they can appear on the leftmost path from
 * tdNode as long as the path only has nodes that don't emit any wikitext.
 * Ex: <td><p>-</p></td>, but not: <td><small>-</small></td>
 *
 * @param Node $tdNode
 * @param bool $inWideTD
 * @param SerializerState $state
 * @param string $text
 * @param array $opts [ 'node' => ?Node ]
 * @return bool True when $text needs escaping
 */
public function tdHandler(
	Node $tdNode, bool $inWideTD, SerializerState $state, string $text, array $opts
): bool {
	$node = $opts['node'] ?? null;

	// Everything below only matters while 'text' is on the same wikitext
	// line as the "|" corresponding to the <td>.
	if ( $node && $state->currLine->firstNode !== $tdNode ) {
		return false;
	}

	// * | in a td should be escaped
	if ( str_contains( $text, '|' ) ) {
		return true;
	}

	// * +-} in SOL position (if they show up on the leftmost path with
	//   only zero-wt-emitting nodes on that path)
	if ( $inWideTD ||
		$state->currLine->text !== '|' ||
		!strspn( $text, '-+}', 0, 1 ) ||
		!$node
	) {
		return false;
	}

	// Climb from the text node up to the <td>: every step has to be a
	// first content node, and every ancestor a zero-width wikitext element.
	for ( $n = $node; $n && $n !== $tdNode; $n = $n->parentNode ) {
		if ( !$this->isFirstContentNode( $n ) ) {
			return false;
		}
		if ( $n !== $node && !WTUtils::isZeroWidthWikitextElt( $n ) ) {
			return false;
		}
	}
	return true;
}
334 | |
/**
 * Run the tokenizer over $str and return its tokens without the
 * trailing EOFTk.
 *
 * @param string $str
 * @param bool $sol Passed to the tokenizer as its 'sol' option
 * @return array
 */
public function tokenizeStr( string $str, bool $sol ): array {
	$tokens = $this->tokenizer->tokenizeSync( $str, [ 'sol' => $sol ] );

	// The stream is expected to end in an EOF token; drop it.
	$eof = array_pop( $tokens );
	Assert::invariant( $eof instanceof EOFTk, 'Expected EOF token!' );

	return $tokens;
}
350 | |
/**
 * Decide whether $text, appended to what has already been emitted on the
 * current line, would tokenize as (part of) a link and therefore needs
 * nowiki protection.
 *
 * @param Node $node Starting point for the walk over previous siblings
 *   that looks for a template encapsulation wrapper
 * @param SerializerState $state
 * @param string $text
 * @return bool True when escaping is needed, or when its safety could
 *   not be proven
 */
public function textCanParseAsLink( Node $node, SerializerState $state, string $text ): bool {
	$env = $state->getEnv();
	$env->trace(
		'wt-escape', 'link-test-text=',
		static function () use ( $text ) {
			return PHPUtils::jsonEncode( $text );
		}
	);

	// Strip away extraneous characters after a ]] or a ]
	// They are inessential to the test of whether the ]]/]
	// will get parsed into a wikilink and only complicate
	// the logic (needing to ignore entities, etc.).
	//
	// preg_replace() returns null on a PCRE failure; fall back to the
	// unstripped text instead of passing null on to string functions
	// (a TypeError under strict_types).
	$text = preg_replace( '/\][^\]]*$/D', ']', $text, 1 ) ?? $text;

	// text only contains ']' chars.
	// Since we stripped everything after ']' above, if a newline is
	// present, a link would have to straddle newlines which is not valid.
	if ( str_contains( $text, "\n" ) ) {
		return false;
	}

	$str = $state->currLine->text . $text;
	$tokens = $this->tokenizeStr( $str, false ); // sol state is irrelevant here
	$n = count( $tokens );
	// The token list may be empty; avoid reading an undefined index.
	$lastToken = $tokens[$n - 1] ?? null;

	$env->trace( 'wt-escape', 'str=', $str, ';tokens=', $tokens );

	// If 'text' remained outside of any non-string tokens,
	// it does not need nowiking.
	if ( $lastToken === $text ||
		( is_string( $lastToken ) &&
			$text === substr( $lastToken, -strlen( $text ) )
		)
	) {
		return false;
	}

	// Verify that the tokenized links are valid links.
	// Tokens are visited last-to-first while $buf accumulates the source
	// text of everything seen so far.
	$buf = '';
	for ( $i = $n - 1; $i >= 0; $i-- ) {
		$t = $tokens[$i];
		if ( is_string( $t ) ) {
			$buf = $t . $buf;
		} elseif ( $t->getName() === 'wikilink' ) {
			$target = $t->getAttributeV( 'href' );
			if ( is_array( $target ) ) {
				// FIXME: Can lead to false negatives.
				// In theory, template expansion *could* make this a link.
				return false;
			}
			if ( $env->isValidLinkTarget( $target ) &&
				!$env->getSiteConfig()->hasValidProtocol( $target )
			) {
				return true;
			}

			// Assumes 'src' will always be present which it seems to be.
			// Tests will fail if anything changes in the tokenizer.
			$buf = $t->dataParsoid->src . $buf;
		} elseif ( $t->getName() === 'extlink' ) {
			// Check if the extlink came from a template which in the end
			// would not really parse as an extlink.

			$href = $t->getAttributeV( 'href' );
			if ( is_array( $href ) ) {
				$href = $href[0];
			}

			if ( !TokenUtils::isTemplateToken( $href ) ) {
				// Not a template and a real href => needs nowiking
				if ( is_string( $href ) && preg_match( '#https?://#', $href ) ) {
					return true;
				}
			} else {
				// Walk back over previous siblings until the first
				// encapsulation wrapper node (or the start) is reached.
				while ( $node ) {
					$node = DiffDOMUtils::previousNonSepSibling( $node );
					if ( $node && WTUtils::isFirstEncapsulationWrapperNode( $node ) ) {
						// FIXME: This is not entirely correct.
						// Assumes that extlink content doesn't have templates.
						// Solution: Count # of non-nested templates encountered
						// and skip over intermediate templates.
						// var content = t.getAttribute('mw:content');
						// var n = intermediateNonNestedTemplates(content);
						break;
					}
				}

				if ( $node instanceof Element && DOMCompat::nodeName( $node ) === 'a' &&
					$node->textContent === DOMCompat::getAttribute( $node, 'href' )
				) {
					// The template expands to an url link => needs nowiking
					return true;
				}
			}

			// Since this will not parse to a real extlink,
			// update buf with the wikitext src for this token.
			$tsr = $t->dataParsoid->tsr;
			$buf = $tsr->substr( $str ) . $buf;
		} else {
			// We have no other smarts => be conservative.
			return true;
		}

		if ( $text === substr( $buf, -strlen( $text ) ) ) {
			// 'text' emerged unscathed
			return false;
		}
	}

	// We couldn't prove safety of skipping nowiki-ing.
	return true;
}
466 | |
/**
 * Tokenize $text and report whether the resulting token stream contains
 * anything that requires the text to be escaped.
 *
 * Side effect: sets $state->hasHeadingEscapes when a heading start token
 * is what triggers the escape.
 *
 * @param SerializerState $state
 * @param bool $onNewline Whether $text starts on a new line
 * @param string $text
 * @return bool
 */
private function hasWikitextTokens(
	SerializerState $state, bool $onNewline, string $text
): bool {
	$env = $state->getEnv();
	$env->trace(
		'wt-escape', 'nl:', $onNewline, ':text=',
		static function () use ( $text ) {
			return PHPUtils::jsonEncode( $text );
		}
	);

	// Start-of-line tokenization only applies outside indent-pre / PHP blocks
	$sol = $onNewline && !$state->inIndentPre && !$state->inPHPBlock;

	// If we're inside a <pre>, we need to add an extra space after every
	// newline so that the tokenizer correctly parses all tokens in a pre
	// instead of just the first one. See T95794.
	if ( $state->inIndentPre ) {
		$text = str_replace( "\n", "\n ", $text );
	}

	// Number of mw:Entity span start tags whose end tag is still pending
	$openEntitySpans = 0;

	// If the token stream has a TagTk, SelfclosingTagTk, EndTagTk or CommentTk
	// then this text needs escaping!
	foreach ( $this->tokenizeStr( $text, $sol ) as $token ) {
		$env->trace( 'wt-escape', 'T:', $token );

		// Ignore html tags that aren't allowed as literals in wikitext
		if ( TokenUtils::isHTMLTag( $token ) ) {
			// Extension tags other than the one we were constructed for
			$foreignExtension =
				TokenUtils::matchTypeOf( $token, '#^mw:Extension(/|$)#' ) &&
				$this->extName !== $token->getAttributeV( 'name' );
			if ( $foreignExtension ) {
				return true;
			}

			// Always escape isolated extension tags (T59469). Consider this:
			// echo "<ref>foo<p></ref></p>" | node parse --html2wt
			// The <ref> and </ref> tag-like text is spread across the DOM, and in
			// the worst case can be anywhere. So, we conservatively escape these
			// elements always (which can lead to excessive nowiki-escapes in some
			// cases, but is always safe).
			$isStartOrEndTag = $token instanceof TagTk || $token instanceof EndTagTk;
			if ( $isStartOrEndTag &&
				$env->getSiteConfig()->isExtensionTag( mb_strtolower( $token->getName() ) )
			) {
				return true;
			}

			// If the tag is one that's allowed in wikitext, we need to escape
			// it inside <nowiki>s, because a text node nodeValue always returns
			// non-escaped entities (that is, converts "&lt;h2&gt;" to "<h2>").
			// TODO: We should also do this for <a> tags because even if they
			// aren't allowed in wikitext and thus don't need to be escaped, the
			// result can be confusing for editors. However, doing it here in a
			// simple way interacts badly with normal link escaping, so it's
			// left for later.
			if ( isset( Consts::$Sanitizer['AllowedLiteralTags'][mb_strtolower( $token->getName() )] ) ) {
				return true;
			}
			continue;
		}

		if ( $token instanceof SelfclosingTagTk ) {
			$name = $token->getName();
			$inLinkContext = $state->inAttribute || $state->inLink;

			// * Ignore RFC/ISBN/PMID tokens when those are encountered in the
			//   context of another link's content -- those are not parsed to
			//   ext-links in that context. (T109371)
			// * Ignore url links in attributes (href, mostly)
			//   since they are not in danger of being autolink-ified there.
			$isMagicLink = ( $name === 'extlink' || $name === 'wikilink' ) &&
				( $token->dataParsoid->stx ?? null ) === 'magiclink';
			if ( $inLinkContext && ( $isMagicLink || $name === 'urllink' ) ) {
				continue;
			}

			// A wikilink only counts when its target is a valid link target;
			// every other self-closing token always does.
			if ( $name !== 'wikilink' ||
				$env->isValidLinkTarget( $token->getAttributeV( 'href' ) ?? '' )
			) {
				return true;
			}
			continue;
		}

		if ( $token instanceof TagTk ) {
			$name = $token->getName();

			// List items are ignored inside captions
			if ( $state->inCaption && $name === 'listItem' ) {
				continue;
			}

			// Ignore mw:Entity tokens (and remember that their end tag is due)
			if ( $name === 'span' && TokenUtils::hasTypeOf( $token, 'mw:Entity' ) ) {
				$openEntitySpans++;
				continue;
			}

			// Ignore table tokens outside of tables
			if ( in_array( $name, [ 'caption', 'td', 'tr', 'th' ], true ) &&
				!TokenUtils::isHTMLTag( $token ) &&
				$state->wikiTableNesting === 0
			) {
				continue;
			}

			// Headings have both SOL and EOL requirements. This tokenization
			// here only verifies SOL requirements, not EOL requirements.
			// So, record this information so that we can strip unnecessary
			// nowikis after the fact.
			if ( preg_match( '/^h\d$/D', $name ) ) {
				$state->hasHeadingEscapes = true;
			}

			return true;
		}

		if ( $token instanceof EndTagTk ) {
			$name = $token->getName();

			// Ignore the end tags of mw:Entity spans seen earlier
			if ( $openEntitySpans > 0 && $name === 'span' ) {
				$openEntitySpans--;
				continue;
			}

			// Ignore heading tokens
			if ( preg_match( '/^h\d$/D', $name ) ) {
				continue;
			}

			// Ignore table tokens outside of tables
			if ( in_array( $name, [ 'caption', 'table' ], true ) &&
				$state->wikiTableNesting === 0
			) {
				continue;
			}

			// </br>!
			if ( mb_strtolower( $name ) === 'br' ) {
				continue;
			}

			return true;
		}
	}

	return false;
}
618 | |
/**
 * Append $str to $buf inside a nowiki section.
 *
 * Opens a <nowiki> first unless one is already open, and closes it
 * afterwards when $close is set.
 *
 * @param string $str Text to append
 * @param bool $close Whether to emit </nowiki> after $str
 * @param bool &$inNowiki Whether a <nowiki> is currently open; updated
 * @param bool &$nowikisAdded Set to true whenever a <nowiki> is opened here
 * @param string &$buf Output buffer being appended to
 */
private static function nowikiWrap(
	string $str, bool $close, bool &$inNowiki, bool &$nowikisAdded, string &$buf
): void {
	$opening = '';
	if ( !$inNowiki ) {
		$opening = '<nowiki>';
		$nowikisAdded = true;
	}

	$buf .= $opening . $str . ( $close ? '</nowiki>' : '' );

	// Open after this call exactly when we did not close it
	$inNowiki = !$close;
}
633 | |
/**
 * This function attempts to wrap smallest escapable units into
 * nowikis (which can potentially add multiple nowiki pairs in a
 * single string). The idea here is that since this should all be
 * text, anything that tokenizes to another construct needs to be
 * wrapped.
 *
 * Full-wrapping is enabled if the string is being escaped within
 * context-specific handlers where the tokenization context might
 * be different from what we use in this code.
 *
 * Trailing newlines of $origText are never wrapped; they are split off
 * first and re-appended after the escaped text.
 *
 * @param SerializerState $state
 * @param bool $sol Whether the text starts in start-of-line position
 * @param string $origText
 * @param bool $fullWrap Wrap the whole text in a single nowiki pair
 * @param bool $dontWrapIfUnnecessary When set, text that did not need any
 *   nowiki during tokenization is returned without a wrapper
 * @return string
 */
public function escapedText(
	SerializerState $state, bool $sol, string $origText,
	bool $fullWrap = false, bool $dontWrapIfUnnecessary = false
): string {
	// Split the text into its body and its trailing newlines
	Assert::invariant(
		preg_match( '/^(.*?)((?:\r?\n)*)$/sD', $origText, $match ),
		"Escaped text matching failed: {$origText}"
	);

	$text = $match[1];
	$nls = $match[2];

	if ( $fullWrap ) {
		return '<nowiki>' . $text . '</nowiki>' . $nls;
	}

	$buf = '';
	$inNowiki = false;
	$nowikisAdded = false;
	// These token types don't come with a closing tag
	$tokensWithoutClosingTag = PHPUtils::makeSet( [ 'listItem', 'td', 'tr' ] );

	// reverse escaping nowiki tags
	// we do this so that they tokenize as nowikis
	// instead of entity enclosed text
	// (the pattern has to match the entity-escaped form; matching the
	// literal "<...>" form would replace each tag with itself)
	$text = preg_replace( '#&lt;(/?nowiki\s*/?\s*)&gt;#i', '<$1>', $text );

	$tokens = $this->tokenizeStr( $text, $sol );

	foreach ( $tokens as $t ) {
		if ( is_string( $t ) ) {
			if ( strlen( $t ) > 0 ) {
				$t = WTSUtils::escapeNowikiTags( $t );
				if ( !$inNowiki && ( ( $sol && $t[0] === ' ' ) || str_contains( $t, "\n " ) ) ) {
					// Split around every space that sits at the start of the
					// string or right after a newline; the odd entries of $x
					// are the captured delimiters ('' or "\n").
					$x = preg_split( '/(^|\n) /', $t, -1, PREG_SPLIT_DELIM_CAPTURE );
					$buf .= $x[0];
					$lastIndexX = count( $x ) - 1;
					for ( $k = 1; $k < $lastIndexX; $k += 2 ) {
						$buf .= $x[$k];
						// A leading space is only wrapped when it follows a
						// newline or when we are in sol position.
						if ( $k !== 1 || $x[$k] === "\n" || $sol ) {
							self::nowikiWrap( ' ', true, $inNowiki, $nowikisAdded, $buf );
						} else {
							$buf .= ' ';
						}
						$buf .= $x[$k + 1];
					}
				} else {
					$buf .= $t;
				}
				$sol = false;
			}
			continue;
		}

		$tsr = $t->dataParsoid->tsr ?? null;
		if ( !( $tsr instanceof SourceRange ) ) {
			$env = $state->getEnv();
			$env->log(
				'error/html2wt/escapeNowiki',
				'Missing tsr for token ',
				PHPUtils::jsonEncode( $t ),
				'while processing text ',
				$text
			);

			// Bail and wrap the whole thing in a nowiki
			// if we have missing information.
			// Use match[1] since text has been clobbered above.
			return '<nowiki>' . $match[1] . '</nowiki>' . $nls;
		}

		// Now put back the escaping we removed above
		$tSrc = WTSUtils::escapeNowikiTags( $tsr->substr( $text ) );
		switch ( true ) {
			case $t instanceof NlTk:
				$buf .= $tSrc;
				$sol = true;
				break;
			case $t instanceof CommentTk:
				// Comments are sol-transparent
				$buf .= $tSrc;
				break;
			case $t instanceof TagTk:
				// Treat tokens with missing tags as self-closing tokens
				// for the purpose of minimal nowiki escaping
				self::nowikiWrap(
					$tSrc,
					isset( $tokensWithoutClosingTag[$t->getName()] ),
					$inNowiki,
					$nowikisAdded,
					$buf
				);
				$sol = false;
				break;
			case $t instanceof EndTagTk:
				self::nowikiWrap( $tSrc, true, $inNowiki, $nowikisAdded, $buf );
				$sol = false;
				break;
			case $t instanceof SelfclosingTagTk:
				if ( $t->getName() !== 'meta' ||
					!TokenUtils::hasTypeOf( $t, 'mw:EmptyLine' )
				) {
					// Don't bother with marker or empty-line metas
					self::nowikiWrap( $tSrc, true, $inNowiki, $nowikisAdded, $buf );
				}
				$sol = false;
				break;
		}
	}

	// close any unclosed nowikis
	if ( $inNowiki ) {
		$buf .= '</nowiki>';
	}

	// Make sure nowiki is always added
	// Ex: "foo]]" won't tokenize into tags at all
	if ( !$nowikisAdded && !$dontWrapIfUnnecessary ) {
		$buf = '';
		self::nowikiWrap( $text, true, $inNowiki, $nowikisAdded, $buf );
	}

	$buf .= $nls;
	return $buf;
}
777 | |
778 | /** |
779 | * @param SerializerState $state |
780 | * @param string $text |
781 | * @param array $opts [ 'node' => Node, 'inMultilineMode' => ?bool, 'isLastChild' => ?bool ] |
782 | * @return string |
783 | */ |
784 | public function escapeWikitext( SerializerState $state, string $text, array $opts ): string { |
785 | $env = $state->getEnv(); |
786 | $env->trace( |
787 | 'wt-escape', 'EWT:', |
788 | static function () use ( $text ) { |
789 | return PHPUtils::jsonEncode( $text ); |
790 | } |
791 | ); |
792 | |
793 | /* ----------------------------------------------------------------- |
794 | * General strategy: If a substring requires escaping, we can escape |
795 | * the entire string without further analysis of the rest of the string. |
796 | * ----------------------------------------------------------------- */ |
797 | |
798 | $hasMagicWord = preg_match( '/(^|\W)(RFC|ISBN|PMID)\s/', $text ); |
799 | $hasAutolink = $env->getSiteConfig()->findValidProtocol( $text ); |
800 | $fullCheckNeeded = !$state->inLink && ( $hasMagicWord || $hasAutolink ); |
801 | $hasQuoteChar = false; |
802 | $indentPreUnsafe = false; |
803 | $hasNonQuoteEscapableChars = false; |
804 | $indentPreSafeMode = $state->inIndentPre || $state->inPHPBlock; |
805 | $sol = $state->onSOL && !$indentPreSafeMode; |
806 | |
807 | // Fast path for special protected characters. |
808 | if ( $state->protect && preg_match( $state->protect, $text ) ) { |
809 | return $this->escapedText( $state, $sol, $text ); |
810 | } |
811 | |
812 | if ( !$fullCheckNeeded ) { |
813 | $hasQuoteChar = str_contains( $text, "'" ); |
814 | $indentPreUnsafe = !$indentPreSafeMode && ( |
815 | preg_match( '/\n +[^\r\n]*?\S+/', $text ) || |
816 | ( $sol && preg_match( '/^ +[^\r\n]*?\S+/', $text ) ) |
817 | ); |
818 | $hasNonQuoteEscapableChars = preg_match( '/[<>\[\]\-\+\|!=#\*:;~{}]|__[^_]*__/', $text ); |
819 | $hasLanguageConverter = preg_match( '/-\{|\}-/', $text ); |
820 | if ( $hasLanguageConverter ) { |
821 | $fullCheckNeeded = true; |
822 | } |
823 | } |
824 | |
825 | // Quick check for the common case (useful to kill a majority of requests) |
826 | // |
827 | // Pure white-space or text without wt-special chars need not be analyzed |
828 | if ( !$fullCheckNeeded && !$hasQuoteChar && !$indentPreUnsafe && !$hasNonQuoteEscapableChars ) { |
829 | $env->trace( 'wt-escape', '---No-checks needed---' ); |
830 | return $text; |
831 | } |
832 | |
833 | // Context-specific escape handler |
834 | $wteHandler = PHPUtils::lastItem( $state->wteHandlerStack ); |
835 | if ( $wteHandler && $wteHandler( $state, $text, $opts ) ) { |
836 | $env->trace( 'wt-escape', '---Context-specific escape handler---' ); |
837 | return $this->escapedText( $state, false, $text, true ); |
838 | } |
839 | |
840 | // Quote-escape test |
841 | if ( str_contains( $text, "''" ) || |
842 | self::hasLeadingEscapableQuoteChar( $text, $opts ) || |
843 | self::hasTrailingEscapableQuoteChar( $text, $opts ) |
844 | ) { |
845 | // Check if we need full-wrapping <nowiki>..</nowiki> |
846 | // or selective <nowiki/> escaping for quotes. |
847 | if ( $fullCheckNeeded || |
848 | $indentPreUnsafe || |
849 | ( $hasNonQuoteEscapableChars && |
850 | $this->hasWikitextTokens( $state, $sol, $text ) |
851 | ) |
852 | ) { |
853 | $env->trace( 'wt-escape', '---quotes: escaping text---' ); |
854 | // If the reason for full wrap is that the text contains non-quote |
855 | // escapable chars, it's still possible to minimize the contents |
856 | // of the <nowiki> (T71950). |
857 | return $this->escapedText( $state, $sol, $text ); |
858 | } else { |
859 | $quoteEscapedText = self::escapedIBSiblingNodeText( $state, $text, $opts ); |
860 | if ( $quoteEscapedText ) { |
861 | $env->trace( 'wt-escape', '---sibling of i/b tag---' ); |
862 | return $quoteEscapedText; |
863 | } |
864 | } |
865 | } |
866 | |
867 | // Template and template-arg markers are escaped unconditionally! |
868 | // Conditional escaping requires matching brace pairs and knowledge |
869 | // of whether we are in template arg context or not. |
870 | if ( preg_match( '/\{\{\{|\{\{|\}\}\}|\}\}/', $text ) ) { |
871 | $env->trace( 'wt-escape', '---Unconditional: transclusion chars---' ); |
872 | return $this->escapedText( $state, false, $text ); |
873 | } |
874 | |
875 | // Once we eliminate the possibility of multi-line tokens, split the text |
876 | // around newlines and escape each line separately. |
877 | if ( preg_match( '/\n./', $text ) ) { |
878 | $env->trace( 'wt-escape', '-- <multi-line-escaping-mode> --' ); |
879 | // We've already processed the full string in a context-specific handler. |
880 | // No more additional processing required. So, push/pop a null handler. |
881 | $state->wteHandlerStack[] = null; |
882 | |
883 | $tmp = []; |
884 | foreach ( explode( "\n", $text ) as $i => $line ) { |
885 | if ( $i > 0 ) { |
886 | // Update state |
887 | $state->onSOL = true; |
888 | $state->currLine->text = ''; |
889 | $opts['inMultilineMode'] = true; |
890 | } |
891 | $tmp[] = $this->escapeWikitext( $state, $line, $opts ); |
892 | } |
893 | $ret = implode( "\n", $tmp ); |
894 | |
895 | array_pop( $state->wteHandlerStack ); |
896 | |
897 | // If nothing changed, check if the original multiline string has |
898 | // any wikitext tokens (ex: multi-line html tags <div\n>foo</div\n>). |
899 | if ( $ret === $text && $this->hasWikitextTokens( $state, $sol, $text ) ) { |
900 | $env->trace( 'wt-escape', '---Found multi-line wt tokens---' ); |
901 | $ret = $this->escapedText( $state, $sol, $text ); |
902 | } |
903 | |
904 | $env->trace( 'wt-escape', '-- </multi-line-escaping-mode> --' ); |
905 | return $ret; |
906 | } |
907 | |
908 | $env->trace( |
909 | 'wt-escape', 'SOL:', $sol, |
910 | static function () use ( $text ) { |
911 | return PHPUtils::jsonEncode( $text ); |
912 | } |
913 | ); |
914 | |
915 | $hasTildes = preg_match( '/~{3,5}/', $text ); |
916 | if ( !$fullCheckNeeded && !$hasTildes ) { |
917 | // {{, {{{, }}}, }} are handled above. |
918 | // Test 1: '', [], <>, __FOO__ need escaping wherever they occur |
919 | // = needs escaping in end-of-line context |
920 | // Test 2: {|, |}, ||, |-, |+, , *#:;, ----, =*= need escaping only in SOL context. |
921 | if ( !$sol && !preg_match( "/''|[<>]|\\[.*\\]|\\]|(=[ ]*(\\n|$))|__[^_]*__/", $text ) ) { |
922 | // It is not necessary to test for an unmatched opening bracket ([) |
923 | // as long as we always escape an unmatched closing bracket (]). |
924 | $env->trace( 'wt-escape', '---Not-SOL and safe---' ); |
925 | return $text; |
926 | } |
927 | |
928 | // Quick checks when on a newline |
929 | // + can only occur as "|+" and - can only occur as "|-" or ---- |
930 | if ( $sol && !preg_match( '/(^|\n)[ #*:;=]|[<\[\]>\|\'!]|\-\-\-\-|__[^_]*__/', $text ) ) { |
931 | $env->trace( 'wt-escape', '---SOL and safe---' ); |
932 | return $text; |
933 | } |
934 | } |
935 | |
936 | // The front-end parser eliminated pre-tokens in the tokenizer |
937 | // and moved them to a stream handler. So, we always conservatively |
938 | // escape text with ' ' in sol posn with one caveat: |
939 | // * and when the current line has block tokens |
940 | if ( $indentPreUnsafe && |
941 | ( !self::hasBlocksOnLine( $state->currLine->firstNode, true ) || |
942 | !empty( $opts['inMultilineMode'] ) |
943 | ) |
944 | ) { |
945 | $env->trace( 'wt-escape', '---SOL and pre---' ); |
946 | $state->hasIndentPreNowikis = true; |
947 | return $this->escapedText( $state, $sol, $text ); |
948 | } |
949 | |
950 | // escape nowiki tags |
951 | $text = WTSUtils::escapeNowikiTags( $text ); |
952 | |
953 | // Use the tokenizer to see if we have any wikitext tokens |
954 | // |
955 | // Ignores entities |
956 | if ( $hasTildes ) { |
957 | $env->trace( 'wt-escape', '---Found tildes---' ); |
958 | return $this->escapedText( $state, $sol, $text ); |
959 | } elseif ( $this->hasWikitextTokens( $state, $sol, $text ) ) { |
960 | $env->trace( 'wt-escape', '---Found WT tokens---' ); |
961 | return $this->escapedText( $state, $sol, $text ); |
962 | } elseif ( preg_match( '/[^\[]*\]/', $text ) && |
963 | $this->textCanParseAsLink( $opts['node'], $state, $text ) |
964 | ) { |
965 | // we have an closing bracket, and |
966 | // - the text will get parsed as a link in |
967 | $env->trace( 'wt-escape', '---Links: complex single-line test---' ); |
968 | return $this->escapedText( $state, $sol, $text ); |
969 | } elseif ( !empty( $opts['isLastChild'] ) && substr( $text, -1 ) === '=' ) { |
970 | // 1. we have an open heading char, and |
971 | // - text ends in a '=' |
972 | // - text comes from the last child |
973 | preg_match( '/^h(\d)/', DOMCompat::nodeName( $state->currLine->firstNode ), $headingMatch ); |
974 | if ( $headingMatch ) { |
975 | $n = intval( $headingMatch[1] ); |
976 | if ( ( $state->currLine->text . $text )[$n] === '=' ) { |
977 | // The first character after the heading wikitext is/will be a '='. |
978 | // So, the trailing '=' can change semantics if it is not nowikied. |
979 | $env->trace( 'wt-escape', '---Heading: complex single-line test---' ); |
980 | return $this->escapedText( $state, $sol, $text ); |
981 | } else { |
982 | return $text; |
983 | } |
984 | } elseif ( strlen( $state->currLine->text ) > 0 && $state->currLine->text[0] === '=' ) { |
985 | $env->trace( 'wt-escape', '---Text-as-heading: complex single-line test---' ); |
986 | return $this->escapedText( $state, $sol, $text ); |
987 | } else { |
988 | return $text; |
989 | } |
990 | } else { |
991 | $env->trace( 'wt-escape', '---All good!---' ); |
992 | return $text; |
993 | } |
994 | } |
995 | |
/**
 * Append one fragment of a template(-arg) argument to the output buffer,
 * opening or closing a `<nowiki>` wrapper around it as required.
 *
 * When `$checkNowiki` is false the fragment is emitted verbatim (after
 * closing any `<nowiki>` that is still open). Otherwise the fragment is
 * inspected for constructs that are unsafe inside an argument:
 * unbalanced `[[ ]]`, `{{ }}` or `-{`, a `=` in an argument that is not
 * serialized as named, a trailing `}` at the very end of the last argument,
 * and `|`.
 *
 * @param string $str Fragment to append
 * @param bool $isLast Whether this is the last fragment of the argument
 * @param bool $checkNowiki Whether $str must be inspected for escaping
 * @param string &$buf Output buffer, appended to in place
 * @param bool &$openNowiki Whether $buf currently ends inside an unclosed `<nowiki>`
 * @param bool $isTemplate Whether the argument belongs to a template
 * @param bool &$serializeAsNamed Set to true when the argument should be
 *   emitted as a named parameter instead of nowiki-escaping its `=`
 * @param array $opts [ 'numPositionalArgs' => int, 'argPositionalIndex' => int, 'type' => string,
 *   'numArgs' => int, 'argIndex' => int ]
 */
private static function appendStr(
	string $str, bool $isLast, bool $checkNowiki, string &$buf, bool &$openNowiki,
	bool $isTemplate, bool &$serializeAsNamed, array $opts
): void {
	// Nothing to inspect: flush any pending </nowiki> and emit as-is.
	if ( !$checkNowiki ) {
		if ( $openNowiki ) {
			$buf .= '</nowiki>';
			$openNowiki = false;
		}
		$buf .= $str;
		return;
	}

	$hasEquals = str_contains( $str, '=' );

	// '=' is not allowed in positional parameters. It can either be
	// nowiki-escaped, or the parameter can be serialized as a named
	// parameter so that no escaping is needed.
	//
	// Going named is not always the better choice. Consider
	//   {{funky-tpl|a|b|c|d|e|f|g|h}}
	// If an editor changes 'a' to 'f=oo' and it is emitted as 1=f=oo,
	// every later parameter effectively becomes named as well:
	//   {{funky-tpl|1=f=oo|2=b|3=c|...|8=h}}
	// whereas
	//   {{funky-tpl|<nowiki>f=oo</nowiki>|b|c|...|h}}
	// is the better result. This is a real problem in production.
	//
	// Simple heuristic for now (can be refined later): go named only if
	//   1. there were no original positional args, or
	//   2. only the last positional arg uses '='.
	if ( $isTemplate && !$serializeAsNamed && $hasEquals ) {
		$positionalCount = $opts['numPositionalArgs'];
		if ( $positionalCount === 0 || $positionalCount === $opts['argPositionalIndex'] ) {
			$serializeAsNamed = true;
		}
	}

	// Tally the independent reasons for protecting this fragment, and
	// remember the targeted rewrite (if any) that could replace a full wrap.
	$reasons = 0;
	$substitution = null;

	// Unmatched braces / brackets should never appear in template
	// arguments: drop the well-formed pairs and see what is left over.
	$unpaired = preg_replace(
		'/\[\[([^\[\]]*)\]\]|\{\{([^\{\}]*)\}\}|-\{([^\{\}]*)\}-/',
		'_$1_',
		$str
	);
	if ( preg_match( '/\{\{|\}\}|\[\[|\]\]|-\{/', $unpaired ) ) {
		$reasons++;
	}

	if ( $opts['type'] !== 'templatearg' && !$serializeAsNamed && $hasEquals ) {
		$reasons++;
	}

	if ( $opts['argIndex'] === $opts['numArgs'] && $isLast && str_ends_with( $str, '}' ) ) {
		// Last part of the last argument: a trailing } would be confused
		// with the }} that ends the template.
		$reasons++;
		$substitution = [ '/(\})$/D', '<nowiki>}</nowiki>' ];
	}

	if ( str_contains( $str, '|' ) ) {
		// An unprotected | would be read as the start of another parameter.
		$reasons++;
		$substitution = [ '/\|/', '{{!}}' ];
	}

	// If we are not already inside a <nowiki> and there is exactly one
	// reason to protect, avoid guarding too much text: just substitute.
	$mustWrap = $reasons > 0;
	if ( !$openNowiki && $reasons === 1 && $substitution ) {
		[ $pattern, $replacement ] = $substitution;
		$str = preg_replace( $pattern, $replacement, $str );
		$mustWrap = false;
	}

	if ( $mustWrap ) {
		if ( !$openNowiki ) {
			$buf .= '<nowiki>';
			$openNowiki = true;
		}
	} elseif ( $openNowiki ) {
		$buf .= '</nowiki>';
		$openNowiki = false;
	}

	$buf .= $str;
}
1095 | |
/**
 * Escape the wikitext of one template / template-arg argument so that it
 * cannot change the meaning of the enclosing transclusion.
 *
 * General strategy:
 *
 * Tokenize the arg wikitext. Anything that parses as tags
 * are good and we need not bother with those. Check for harmful
 * characters `[[]]{{}}` or additionally `=` in positional parameters and escape
 * those fragments since these characters could change semantics of the entire
 * template transclusion.
 *
 * This function makes a couple of assumptions:
 *
 * 1. The tokenizer sets tsr on all non-string tokens.
 * 2. The tsr on TagTk and EndTagTk corresponds to the
 *    width of the opening and closing wikitext tags and not
 *    the entire DOM range they span in the end.
 *
 * @param string $arg Wikitext of the argument value
 * @param array $opts [ 'serializeAsNamed' => bool, 'numPositionalArgs' => int,
 *   'argPositionalIndex' => int, 'type' => string, 'numArgs' => int, 'argIndex' => int ]
 * @return array [ 'serializeAsNamed' => bool, 'v' => string ] where 'v' is the
 *   escaped wikitext and 'serializeAsNamed' is the (possibly updated) flag
 */
public function escapeTplArgWT( string $arg, array $opts ): array {
	$env = $this->env;
	$serializeAsNamed = $opts['serializeAsNamed'];
	$buf = '';
	$openNowiki = false;
	$isTemplate = $opts['type'] === 'template';

	$tokens = $this->tokenizeStr( $arg, false );

	for ( $i = 0, $n = count( $tokens ); $i < $n; $i++ ) {
		$t = $tokens[$i];
		$last = $i === $n - 1;

		// For mw:Entity spans, the opening and closing tags have 0 width
		// and the enclosed content is the decoded entity. Hence the
		// special case to serialize back the entity's source.
		if ( $t instanceof TagTk ) {
			$da = $t->dataParsoid;
			if ( TokenUtils::matchTypeOf( $t, '#^mw:(Placeholder|Entity)(/|$)#' ) ) {
				// Skip over the content token to the matching end tag and
				// copy the whole source range through untouched.
				$i += 2;
				$width = $tokens[$i]->dataParsoid->tsr->end - $da->tsr->start;
				self::appendStr(
					substr( $arg, $da->tsr->start, $width ),
					$last,
					false,
					$buf,
					$openNowiki,
					$isTemplate,
					$serializeAsNamed,
					$opts
				);
				continue;
			} elseif ( TokenUtils::hasTypeOf( $t, 'mw:Nowiki' ) ) {
				// Advance to the matching mw:Nowiki end tag (if there is one).
				$i++;
				while ( $i < $n &&
					( !$tokens[$i] instanceof EndTagTk ||
						!TokenUtils::hasTypeOf( $tokens[$i], 'mw:Nowiki' )
					)
				) {
					$i++;
				}
				if ( $i < $n ) {
					// After tokenization, we can get here:
					// * Text explicitly protected by <nowiki> in the parameter.
					// * Other things that should be protected but weren't
					//   according to the tokenizer.
					// In template argument, we only need to check for unmatched
					// braces and brackets pairs (which is done in appendStr),
					// but only if they weren't explicitly protected in the
					// passed wikitext.
					$width = $tokens[$i]->dataParsoid->tsr->end - $da->tsr->start;
					$substr = substr( $arg, $da->tsr->start, $width );
					self::appendStr(
						$substr,
						$last,
						!preg_match( '#<nowiki>[^<]*</nowiki>#', $substr ),
						$buf,
						$openNowiki,
						$isTemplate,
						$serializeAsNamed,
						$opts
					);
				}
				continue;
			}
		}

		// Plain text is the only thing that needs a full escaping check.
		if ( is_string( $t ) ) {
			self::appendStr(
				$t,
				$last,
				true,
				$buf,
				$openNowiki,
				$isTemplate,
				$serializeAsNamed,
				$opts
			);
			continue;
		}

		switch ( true ) {
			case $t instanceof TagTk:
			case $t instanceof EndTagTk:
			case $t instanceof NlTk:
			case $t instanceof CommentTk:
				// Tokens that parsed as markup are copied through from source.
				$da = $t->dataParsoid;
				if ( empty( $da->tsr ) ) {
					$errors = [ 'Missing tsr for: ' . PHPUtils::jsonEncode( $t ) ];
					$errors[] = 'Arg : ' . PHPUtils::jsonEncode( $arg );
					$errors[] = 'Toks: ' . PHPUtils::jsonEncode( $tokens );
					$env->log( 'error/html2wt/wtescape', implode( "\n", $errors ) );
					// FIXME $da->tsr will be undefined below.
					// Should we throw an explicit exception here?
				}
				self::appendStr(
					$da->tsr->substr( $arg ),
					$last,
					false,
					$buf,
					$openNowiki,
					$isTemplate,
					$serializeAsNamed,
					$opts
				);
				break;
			case $t instanceof SelfclosingTagTk:
				$da = $t->dataParsoid;
				if ( empty( $da->tsr ) ) {
					$errors = [ 'Missing tsr for: ' . PHPUtils::jsonEncode( $t ) ];
					$errors[] = 'Arg : ' . PHPUtils::jsonEncode( $arg );
					$errors[] = 'Toks: ' . PHPUtils::jsonEncode( $tokens );
					$env->log( 'error/html2wt/wtescape', implode( "\n", $errors ) );
					// FIXME $da->tsr will be undefined below.
					// Should we throw an explicit exception here?
				}
				$tkSrc = $da->tsr->substr( $arg );
				// Replace pipe by an entity. This is not completely safe.
				if ( $t->getName() === 'extlink' || $t->getName() === 'urllink' ) {
					$tkBits = $this->tokenizer->tokenizeSync( $tkSrc, [
						'startRule' => 'tplarg_or_template_or_bust',
						'sol' => true,
					] );
					foreach ( $tkBits as $bit ) {
						if ( $bit instanceof Token ) {
							self::appendStr(
								$bit->dataParsoid->src,
								$last,
								false,
								$buf,
								$openNowiki,
								$isTemplate,
								$serializeAsNamed,
								$opts
							);
						} else {
							// Convert to a named param w/ the same reasoning
							// as above for appendStr, however, here we replace
							// with an entity to avoid breaking up querystrings
							// with nowikis.
							//
							// NOTE(review): appendStr compares numPositionalArgs
							// against 'argPositionalIndex', whereas this branch
							// uses 'argIndex'. Left unchanged here; confirm
							// whether the difference is intentional.
							if ( $isTemplate && !$serializeAsNamed && str_contains( $bit, '=' ) ) {
								if ( $opts['numPositionalArgs'] === 0
									|| $opts['numPositionalArgs'] === $opts['argIndex']
								) {
									$serializeAsNamed = true;
								} else {
									// Numeric character reference for '=' so it is
									// not read as a parameter-name separator.
									$bit = str_replace( '=', '&#61;', $bit );
								}
							}
							// Numeric character reference for '|' so it is not
							// read as a parameter separator.
							$buf .= str_replace( '|', '&#124;', $bit );
						}
					}
				} else {
					self::appendStr(
						$tkSrc,
						$last,
						false,
						$buf,
						$openNowiki,
						$isTemplate,
						$serializeAsNamed,
						$opts
					);
				}
				break;
			case $t instanceof EOFTk:
				break;
		}
	}

	// If nowiki still open, close it now.
	if ( $openNowiki ) {
		$buf .= '</nowiki>';
	}

	return [ 'serializeAsNamed' => $serializeAsNamed, 'v' => $buf ];
}
1294 | |
/**
 * Entity-escape and then wikitext-escape the content of a link.
 *
 * While escaping, a context-specific handler (media option vs. wikilink)
 * is pushed on `$state->wteHandlerStack` and `$state->inLink` is set;
 * the handler is popped and the flag cleared again afterwards.
 * `$state->onSOL` is overwritten with `$solState`.
 *
 * See also `escapeLinkTarget` in LinkHandler.php
 *
 * @param SerializerState $state
 * @param string $str Link content to escape
 * @param bool $solState Value assigned to $state->onSOL before escaping
 * @param Node $node Passed to escapeWikitext as the 'node' option
 * @param bool $isMedia Selects mediaOptionHandler instead of wikilinkHandler
 * @return string
 */
public function escapeLinkContent(
	SerializerState $state, string $str, bool $solState, Node $node, bool $isMedia
): string {
	$handlerName = $isMedia ? 'mediaOptionHandler' : 'wikilinkHandler';

	// Entities first, so the wikitext pass sees the final text.
	$entityEscaped = Utils::escapeWtEntities( $str );

	// Set up the link context for the wikitext escaper ...
	$state->onSOL = $solState;
	$state->wteHandlerStack[] = [ $this, $handlerName ];
	$state->inLink = true;

	$escaped = $this->escapeWikitext( $state, $entityEscaped, [ 'node' => $node ] );

	// ... and tear it down again.
	$state->inLink = false;
	array_pop( $state->wteHandlerStack );

	return $escaped;
}
1323 | } |