Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 682 |
|
0.00% |
0 / 33 |
CRAP | |
0.00% |
0 / 1 |
WikitextSerializer | |
0.00% |
0 / 682 |
|
0.00% |
0 / 33 |
77562 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
linkHandler | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
languageVariantHandler | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
escapeWikitext | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
domToWikitext | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
htmlToWikitext | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
getAttributeKey | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
20 | |||
getAttributeValue | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
30 | |||
getAttributeValueAsShadowInfo | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
6 | |||
serializedImageAttrVal | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
serializedAttrVal | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
tagNeedsEscaping | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
wrapAngleBracket | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
20 | |||
serializeHTMLTag | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
72 | |||
serializeHTMLEndTag | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
42 | |||
serializeAttributes | |
0.00% |
0 / 56 |
|
0.00% |
0 / 1 |
992 | |||
handleLIHackIfApplicable | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
42 | |||
formatStringSubst | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
20 | |||
createParamComparator | |
0.00% |
0 / 56 |
|
0.00% |
0 / 1 |
342 | |||
serializePart | |
0.00% |
0 / 121 |
|
0.00% |
0 / 1 |
1892 | |||
serializeFromParts | |
0.00% |
0 / 33 |
|
0.00% |
0 / 1 |
132 | |||
serializeExtensionStartTag | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
30 | |||
defaultExtensionHandler | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
20 | |||
serializeText | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
42 | |||
serializeTextNode | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
emitWikitext | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
serializeNodeInternal | |
0.00% |
0 / 63 |
|
0.00% |
0 / 1 |
870 | |||
serializeNode | |
0.00% |
0 / 62 |
|
0.00% |
0 / 1 |
306 | |||
stripUnnecessaryHeadingNowikis | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
20 | |||
stripUnnecessaryIndentPreNowikis | |
0.00% |
0 / 32 |
|
0.00% |
0 / 1 |
72 | |||
stripUnnecessaryQuoteNowikis | |
0.00% |
0 / 63 |
|
0.00% |
0 / 1 |
1482 | |||
serializeDOM | |
0.00% |
0 / 30 |
|
0.00% |
0 / 1 |
156 | |||
trace | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Html2Wt; |
5 | |
6 | use Closure; |
7 | use Exception; |
8 | use Wikimedia\Assert\Assert; |
9 | use Wikimedia\Parsoid\Config\Env; |
10 | use Wikimedia\Parsoid\Core\InternalException; |
11 | use Wikimedia\Parsoid\DOM\Comment; |
12 | use Wikimedia\Parsoid\DOM\Document; |
13 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
14 | use Wikimedia\Parsoid\DOM\Element; |
15 | use Wikimedia\Parsoid\DOM\Node; |
16 | use Wikimedia\Parsoid\DOM\Text; |
17 | use Wikimedia\Parsoid\Html2Wt\ConstrainedText\ConstrainedText; |
18 | use Wikimedia\Parsoid\Html2Wt\DOMHandlers\DOMHandler; |
19 | use Wikimedia\Parsoid\Html2Wt\DOMHandlers\DOMHandlerFactory; |
20 | use Wikimedia\Parsoid\NodeData\ParamInfo; |
21 | use Wikimedia\Parsoid\NodeData\TemplateInfo; |
22 | use Wikimedia\Parsoid\Tokens\KV; |
23 | use Wikimedia\Parsoid\Tokens\TagTk; |
24 | use Wikimedia\Parsoid\Tokens\Token; |
25 | use Wikimedia\Parsoid\Utils\ContentUtils; |
26 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
27 | use Wikimedia\Parsoid\Utils\DOMCompat; |
28 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
29 | use Wikimedia\Parsoid\Utils\DOMUtils; |
30 | use Wikimedia\Parsoid\Utils\PHPUtils; |
31 | use Wikimedia\Parsoid\Utils\Title; |
32 | use Wikimedia\Parsoid\Utils\TokenUtils; |
33 | use Wikimedia\Parsoid\Utils\Utils; |
34 | use Wikimedia\Parsoid\Utils\WTUtils; |
35 | use Wikimedia\Parsoid\Wikitext\Consts; |
36 | |
37 | /** |
38 | * HTML to wikitext serializer.
39 | * Serializes a chunk of tokens or an HTML DOM to MediaWiki's wikitext flavor. |
40 | * |
41 | * This serializer is designed to eventually |
42 | * - accept arbitrary HTML and |
43 | * - serialize that to wikitext in a way that round-trips back to the same |
44 | * HTML DOM as far as possible within the limitations of wikitext. |
45 | * |
46 | * Not much effort has been invested so far on supporting |
47 | * non-Parsoid/VE-generated HTML. Some of this involves adaptively switching |
48 | * between wikitext and HTML representations based on the values of attributes |
49 | * and DOM context. A few special cases are already handled adaptively |
50 | * (multi-paragraph list item contents are serialized as HTML tags for |
51 | * example, generic A elements are serialized to HTML A tags), but in general |
52 | * support for this is mostly missing. |
53 | * |
54 | * Example issue: |
55 | * ``` |
56 | * <h1><p>foo</p></h1> will serialize to =\nfoo\n= whereas the |
57 | * correct serialized output would be: =<p>foo</p>= |
58 | * ``` |
59 | * |
60 | * What to do about this? |
61 | * - add a generic 'can this HTML node be serialized to wikitext in this |
62 | * context' detection method and use that to adaptively switch between |
63 | * wikitext and HTML serialization. |
64 | * |
65 | */ |
66 | class WikitextSerializer { |
67 | |
68 | /** @var array<string,true> Attribute names (as keys) that are never serialized */
69 | private const IGNORED_ATTRIBUTES = [ |
70 | 'data-parsoid' => true, |
71 | 'data-ve-changed' => true, |
72 | 'data-parsoid-changed' => true, |
73 | 'data-parsoid-diff' => true, |
74 | 'data-parsoid-serialize' => true, |
75 | DOMDataUtils::DATA_OBJECT_ATTR_NAME => true, |
76 | ]; |
77 | |
78 | /** @var string[] attribute name => value regexp */ |
79 | private const PARSOID_ATTRIBUTES = [ |
80 | 'about' => '/^#mwt\d+$/D', |
81 | 'typeof' => '/(^|\s)mw:\S+/', |
82 | ]; |
83 | |
84 | /** @var string Regexp */ |
85 | private const TRAILING_COMMENT_OR_WS_AFTER_NL_REGEXP |
86 | = '/\n(\s|' . Utils::COMMENT_REGEXP_FRAGMENT . ')*$/D'; |
87 | |
88 | /** @var string Regexp */ |
89 | private const FORMATSTRING_REGEXP = |
90 | '/^(\n)?(\{\{ *_+)(\n? *\|\n? *_+ *= *)(_+)(\n? *\}\})(\n)?$/D'; |
91 | |
92 | /** @var string Regexp matching a string that consists only of whitespace and/or comments */
93 | private const COMMENT_OR_WS_REGEXP = '/^(\s|' . Utils::COMMENT_REGEXP_FRAGMENT . ')*$/D'; |
94 | |
95 | /** @var string Regexp for testing whether nowiki added around heading-like wikitext is needed */ |
96 | private const HEADING_NOWIKI_REGEXP = '/^(?:' . Utils::COMMENT_REGEXP_FRAGMENT . ')*' |
97 | . '<nowiki>(=+[^=]+=+)<\/nowiki>(.+)$/D'; |
98 | |
99 | /** @var string[] Separator regexps, keyed by name */
100 | private static $separatorREs = [ |
101 | 'pureSepRE' => '/^[ \t\r\n]*$/D', |
102 | 'sepPrefixWithNlsRE' => '/^[ \t]*\n+[ \t\r\n]*/', |
103 | 'sepSuffixWithNlsRE' => '/\n[ \t\r\n]*$/D', |
104 | ]; |
105 | |
106 | /** @var WikitextEscapeHandlers */ |
107 | public $wteHandlers; |
108 | |
109 | /** @var Env */ |
110 | public $env; |
111 | |
112 | /** @var SerializerState */ |
113 | private $state; |
114 | |
115 | /** @var string Log type for trace() */ |
116 | private $logType; |
117 | |
/**
 * Build a serializer bound to the given environment.
 *
 * @param Env $env
 * @param array $options List of options for serialization:
 *   - logType: (string) Log type used by trace(); defaults to 'trace/wts'.
 *   - extName: (string) Handed to the WikitextEscapeHandlers constructor
 *     (null when not set).
 *   The full option list is also passed on to SerializerState.
 */
public function __construct( Env $env, $options ) {
	$logType = $options['logType'] ?? 'trace/wts';
	$extName = $options['extName'] ?? null;

	$this->env = $env;
	$this->logType = $logType;
	// The state object is handed this serializer, so it is created only
	// after $env and $logType have been assigned.
	$this->state = new SerializerState( $this, $options );
	$this->wteHandlers = new WikitextEscapeHandlers( $env, $extName );
}
130 | |
/**
 * Main link handler.
 *
 * Used in multiple tag handlers (<a> and <link>), and hence added as a
 * top-level method. The work itself is done by LinkHandlerUtils.
 *
 * @param Element $node
 */
public function linkHandler( Element $node ): void {
	$state = $this->state;
	LinkHandlerUtils::linkHandler( $state, $node );
}
139 | |
/**
 * Hand the node to LanguageVariantHandler::handleLanguageVariant() together
 * with this serializer's state.
 *
 * @param Node $node
 */
public function languageVariantHandler( Node $node ): void {
	$state = $this->state;
	LanguageVariantHandler::handleLanguageVariant( $state, $node );
}
146 | |
/**
 * Escape wikitext-like strings in $text so that $text renders as a plain
 * string when rendered as HTML. The escaping depends on the context in which
 * $text is present (ex: start-of-line, in a link, etc.).
 *
 * This is a thin wrapper around the WikitextEscapeHandlers instance held in
 * $this->wteHandlers.
 *
 * @param SerializerState $state
 * @param string $text
 * @param array $opts
 *   - node: (Node)
 *   - isLastChild: (bool)
 * @return string
 */
public function escapeWikitext( SerializerState $state, string $text, array $opts ): string {
	$escaped = $this->wteHandlers->escapeWikitext( $state, $text, $opts );
	return $escaped;
}
162 | |
/**
 * Serialize a document fragment with a fresh serializer instance.
 *
 * @param array $opts Options for the new serializer; 'logType' is always
 *   overwritten with this serializer's log type.
 * @param DocumentFragment $node
 * @return string
 */
public function domToWikitext( array $opts, DocumentFragment $node ): string {
	$opts['logType'] = $this->logType;
	// A separate instance is used, so this serializer's own state object
	// is not the one driving the nested serialization.
	return ( new self( $this->env, $opts ) )->serializeDOM( $node );
}
170 | |
/**
 * Load an HTML string into a document fragment (marked as new content) and
 * serialize it via domToWikitext().
 *
 * @param array $opts Options passed through to domToWikitext().
 * @param string $html
 * @return string
 */
public function htmlToWikitext( array $opts, string $html ): string {
	$ownerDoc = $this->env->getTopLevelDoc();
	$loadOpts = [ 'markNew' => true ];
	return $this->domToWikitext(
		$opts,
		ContentUtils::createAndLoadDocumentFragment( $ownerDoc, $html, $loadOpts )
	);
}
177 | |
/**
 * Get the wikitext for an attribute name.
 *
 * @param Element $node Element whose data-mw 'attribs' are consulted.
 * @param string $key Attribute name.
 * @return string The serialized generator wikitext when data-mw records
 *   generated (html) content for this key, otherwise $key unchanged.
 */
public function getAttributeKey( Element $node, string $key ): string {
	foreach ( DOMDataUtils::getDataMw( $node )->attribs ?? [] as $attr ) {
		// Only entries whose key text matches and whose key is generated
		// content are of interest.
		if ( ( $attr->key['txt'] ?? null ) !== $key || !isset( $attr->key['html'] ) ) {
			continue;
		}
		// Serialize HTML back to generator wikitext.
		$opts = [
			'env' => $this->env,
			'onSOL' => false,
		];
		return $this->htmlToWikitext( $opts, $attr->key['html'] );
	}
	return $key;
}
192 | |
/**
 * Get the wikitext for an attribute value that data-mw records as generated
 * content.
 *
 * @param Element $node Element whose data-mw 'attribs' are consulted.
 * @param string $key Attribute name.
 * @return ?string The wikitext value, or null if data-mw has no generated
 *   (html) value for this attribute.
 */
public function getAttributeValue( Element $node, string $key ): ?string {
	foreach ( DOMDataUtils::getDataMw( $node )->attribs ?? [] as $attr ) {
		// PORT-FIXME: not type safe. Need documentation on attrib format.
		$keyMatches = $attr->key === $key || ( $attr->key['txt'] ?? null ) === $key;

		// Only return if the value is generated (ie. .html); it may just
		// be in .txt form.
		// html:"" will serialize to "" and will be returned here. This is
		// used to suppress the =".." string in the attribute in scenarios
		// where the template generates a "k=v" string.
		// Ex: <div {{1x|1=style='color:red'}}>foo</div>
		if ( !$keyMatches || !isset( $attr->value['html'] ) ) {
			continue;
		}

		// Serialize HTML back to generator wikitext.
		$opts = [
			'env' => $this->env,
			'onSOL' => false,
			'inAttribute' => true,
		];
		return $this->htmlToWikitext( $opts, $attr->value['html'] );
	}
	return null;
}
223 | |
/**
 * Like getAttributeValue(), but wraps a found value in a shadow-info tuple.
 *
 * @param Element $node
 * @param string $key
 * @return array|null A tuple in {@link WTSUtils::getShadowInfo()} format,
 *   with an extra 'fromDataMW' flag; null when getAttributeValue() finds
 *   nothing.
 */
public function getAttributeValueAsShadowInfo( Element $node, string $key ): ?array {
	$value = $this->getAttributeValue( $node, $key );
	return $value === null ? null : [
		'value' => $value,
		'modified' => false,
		'fromsrc' => true,
		'fromDataMW' => true,
	];
}
242 | |
/**
 * Look up an attribute value, preferring the data-mw of one node and falling
 * back to the attribute shadow info of another.
 *
 * @param Element $dataMWnode Node whose data-mw is checked first.
 * @param Element $htmlAttrNode Node used for the shadow-info fallback.
 * @param string $key
 * @return array A tuple in {@link WTSUtils::getShadowInfo()} format,
 *   possibly with an extra 'fromDataMW' flag.
 */
public function serializedImageAttrVal(
	Element $dataMWnode, Element $htmlAttrNode, string $key
): array {
	$fromDataMw = $this->getAttributeValueAsShadowInfo( $dataMWnode, $key );
	if ( $fromDataMw ) {
		return $fromDataMw;
	}
	return WTSUtils::getAttributeShadowInfo( $htmlAttrNode, $key );
}
256 | |
/**
 * Single-node form of serializedImageAttrVal(): the same element supplies
 * both the data-mw lookup and the shadow-info fallback.
 *
 * @param Element $node
 * @param string $name Attribute name.
 * @return array A tuple in {@link WTSUtils::getShadowInfo()} format,
 *   possibly with an extra 'fromDataMW' flag.
 */
public function serializedAttrVal( Element $node, string $name ): array {
	$info = $this->serializedImageAttrVal( $node, $node, $name );
	return $info;
}
260 | |
/**
 * Check if a tag name needs escaping, i.e. whether
 * WTUtils::isAnnOrExtTag() reports it for the current environment.
 *
 * @param string $name Tag name.
 * @return bool
 */
public function tagNeedsEscaping( string $name ): bool {
	$env = $this->env;
	return WTUtils::isAnnOrExtTag( $env, $name );
}
270 | |
/**
 * Wrap the inner text of a tag in angle brackets.
 *
 * When the tag name needs escaping (see tagNeedsEscaping()), the brackets
 * are emitted as the entities `&lt;` / `&gt;` so the output is not read back
 * as a tag. Otherwise literal `<` / `>` are used.
 *
 * @param Token $token Token for the tag being emitted.
 * @param string $inner Everything that goes between the brackets.
 * @return string
 */
public function wrapAngleBracket( Token $token, string $inner ): string {
	$name = $token->getName();

	// Allow for html tags that shadow extension tags found in source
	// to roundtrip. They only parse as html tags if they are unclosed,
	// since extension tags bail on parsing without closing tags.
	//
	// This only applies when wrapAngleBracket() is being called for
	// start tags, but we wouldn't be here if it was autoInsertedEnd
	// anyways.
	$shadowsExtTag = isset( Consts::$Sanitizer['AllowedLiteralTags'][$name] ) &&
		!empty( $token->dataParsoid->autoInsertedEnd );

	if ( $this->tagNeedsEscaping( $name ) && !$shadowsExtTag ) {
		// Entity-escape the brackets; emitting them literally would make
		// this branch identical to the unescaped one below.
		return "&lt;{$inner}&gt;";
	}
	return "<$inner>";
}
290 | |
/**
 * Serialize the start tag of an element using HTML tag syntax.
 *
 * @param Element $node
 * @param bool $wrapperUnmodified When true, the original source of the
 *   element's opening range is returned instead of a regenerated tag.
 * @return string
 */
public function serializeHTMLTag( Element $node, bool $wrapperUnmodified ): string {
	// TODO(arlolra): As of 1.3.0, html pre is considered an extension
	// and wrapped in encapsulation. When that version is no longer
	// accepted for serialization, we can remove this backwards
	// compatibility code.
	//
	// 'inHTMLPre' flag has to be updated always,
	// even when we are selsering in the wrapperUnmodified case.
	$token = WTSUtils::mkTagTk( $node );
	$name = $token->getName();
	if ( $name === 'pre' ) {
		// html-syntax pre is very similar to nowiki
		$this->state->inHTMLPre = true;
	}

	if ( $wrapperUnmodified ) {
		$openRange = DOMDataUtils::getDataParsoid( $node )->dsr->openRange();
		return $this->state->getOrigSrc( $openRange ) ?? '';
	}

	$da = $token->dataParsoid;
	if ( !empty( $da->autoInsertedStart ) ) {
		// The start tag was not in the source; emit nothing for it.
		return '';
	}

	$selfClosing = ( Utils::isVoidElement( $name ) && empty( $da->noClose ) )
		|| !empty( $da->selfClose );
	$close = $selfClosing ? ' /' : '';

	$sAttribs = $this->serializeAttributes( $node, $token );
	if ( $sAttribs !== '' ) {
		$sAttribs = ' ' . $sAttribs;
	}

	// srcTagName cannot be '' so, it is okay to use ?? operator
	$tokenName = $da->srcTagName ?? $name;
	return $this->wrapAngleBracket( $token, $tokenName . $sAttribs . $close );
}
332 | |
/**
 * Serialize the end tag of an element using HTML tag syntax.
 *
 * @param Element $node
 * @param bool $wrapperUnmodified When true, the original source of the
 *   element's closing range is returned instead of a regenerated tag.
 * @return string The end tag, or '' when none should be emitted.
 */
public function serializeHTMLEndTag( Element $node, $wrapperUnmodified ): string {
	if ( $wrapperUnmodified ) {
		// NOTE(review): unlike serializeHTMLTag(), this early return
		// happens before the 'inHTMLPre' flag is reset below -- confirm
		// that is intended for unmodified <pre> wrappers.
		$closeRange = DOMDataUtils::getDataParsoid( $node )->dsr->closeRange();
		return $this->state->getOrigSrc( $closeRange ) ?? '';
	}

	$token = WTSUtils::mkEndTagTk( $node );
	$name = $token->getName();
	if ( $name === 'pre' ) {
		$this->state->inHTMLPre = false;
	}

	$dp = $token->dataParsoid;
	// No end tag is written for auto-inserted ends, void elements and
	// self-closed tags.
	if ( !empty( $dp->autoInsertedEnd )
		|| Utils::isVoidElement( $name )
		|| !empty( $dp->selfClose )
	) {
		return '';
	}

	// srcTagName cannot be '' so, it is okay to use ?? operator
	$tokenName = $dp->srcTagName ?? $name;
	return $this->wrapAngleBracket( $token, "/{$tokenName}" );
}
362 | |
/**
 * Serialize the attributes of a token into a space-separated string of
 * `key="value"` pairs (or bare keys).
 *
 * Double quotes inside emitted values are written as `&quot;` so a value
 * cannot terminate its surrounding `"..."` early. For values that did not
 * come from source (and when $isWt is false), wikitext entities are escaped
 * and `>` is written as `&gt;`.
 *
 * Note: the class value of flagged-empty elements is rewritten in place on
 * the token's KV (the "mw-empty-elt" class is stripped).
 *
 * @param Element $node Element the token was built from.
 * @param Token $token Token whose attribs and data-parsoid are serialized.
 * @param bool $isWt When true, values not from source are left unescaped.
 * @return string
 */
public function serializeAttributes( Element $node, Token $token, bool $isWt = false ): string {
	$attribs = $token->attribs;

	$out = [];
	foreach ( $attribs as $kv ) {
		// Tokens created during html2wt don't have nested tokens for keys.
		// But, they could be integers but we want strings below.
		$k = (string)$kv->k;
		$v = null;
		$vInfo = null;

		// Unconditionally ignore
		// (all of the IGNORED_ATTRIBUTES should be filtered out earlier,
		// but ignore them here too just to make sure.)
		if ( isset( self::IGNORED_ATTRIBUTES[$k] ) || $k === 'data-mw' ) {
			continue;
		}

		// Ignore parsoid-like ids. They may have been left behind
		// by clients and shouldn't be serialized. This can also happen
		// in v2/v3 API when there is no matching data-parsoid entry found
		// for this id.
		if ( $k === 'id' && preg_match( '/^mw[\w-]{2,}$/D', $kv->v ) ) {
			if ( WTUtils::isNewElt( $node ) ) {
				// Parsoid id found on element without a matching data-parsoid. Drop it!
			} else {
				$vInfo = $token->getAttributeShadowInfo( $k );
				if ( !$vInfo['modified'] && $vInfo['fromsrc'] ) {
					$out[] = $k . '=' . '"' . str_replace( '"', '&quot;', $vInfo['value'] ) . '"';
				}
			}
			continue;
		}

		// Parsoid auto-generates ids for headings and they should
		// be stripped out, except if this is not auto-generated id.
		if ( $k === 'id' && DOMUtils::isHeading( $node ) ) {
			if ( !empty( DOMDataUtils::getDataParsoid( $node )->reusedId ) ) {
				$vInfo = $token->getAttributeShadowInfo( $k );
				// PORT-FIXME: is this safe? value could be a token or token array
				$out[] = $k . '="' . str_replace( '"', '&quot;', $vInfo['value'] ) . '"';
			}
			continue;
		}

		// Strip Parsoid-inserted class="mw-empty-elt" attributes
		if ( $k === 'class'
			&& isset( Consts::$Output['FlaggedEmptyElts'][DOMCompat::nodeName( $node )] )
		) {
			$kv->v = preg_replace( '/\bmw-empty-elt\b/', '', $kv->v, 1 );
			if ( !$kv->v ) {
				continue;
			}
		}

		// Strip other Parsoid-generated values
		//
		// FIXME: Given that we are currently escaping about/typeof keys
		// that show up in wikitext, we could unconditionally strip these
		// away right now.
		$parsoidValueRegExp = self::PARSOID_ATTRIBUTES[$k] ?? null;
		if ( $parsoidValueRegExp && preg_match( $parsoidValueRegExp, $kv->v ) ) {
			$v = preg_replace( $parsoidValueRegExp, '', $kv->v );
			if ( $v ) {
				$out[] = $k . '="' . $v . '"';
			}
			continue;
		}

		if ( strlen( $k ) > 0 ) {
			$vInfo = $token->getAttributeShadowInfo( $k );
			$v = $vInfo['value'];
			// Deal with k/v's that were template-generated
			$kk = $this->getAttributeKey( $node, $k );
			// Pass in $k, not $kk since $kk can potentially
			// be original wikitext source for 'k' rather than
			// the string value of the key.
			$vv = $this->getAttributeValue( $node, $k ) ?? $v;
			// Remove encapsulation from protected attributes
			// in pegTokenizer.pegjs:generic_newline_attribute
			$kk = preg_replace( '/^data-x-/i', '', $kk, 1 );
			// PORT-FIXME: is this type safe? $vv could be a ConstrainedText
			if ( $vv !== null && strlen( $vv ) > 0 ) {
				if ( !$vInfo['fromsrc'] && !$isWt ) {
					// Escape wikitext entities
					$vv = str_replace( '>', '&gt;', Utils::escapeWtEntities( $vv ) );
				}
				$out[] = $kk . '="' . str_replace( '"', '&quot;', $vv ) . '"';
			} elseif ( preg_match( '/[{<]/', $kk ) ) {
				// Templated, <*include*>, or <ext-tag> generated
				$out[] = $kk;
			} else {
				$out[] = $kk . '=""';
			}
			continue;
			// PORT-FIXME: is this type safe? $k->v could be a Token or Token array
		} elseif ( strlen( $kv->v ) ) {
			// not very likely..
			$out[] = $kv->v;
		}
	}

	// SSS FIXME: It can be reasonably argued that we can permanently delete
	// dangerous and unacceptable attributes in the interest of safety/security
	// and the resultant dirty diffs should be acceptable. But, this is
	// something to do in the future once we have passed the initial tests
	// of parsoid acceptance.
	//
	// 'a' data attribs -- look for attributes that were removed
	// as part of sanitization and add them back
	$dataParsoid = $token->dataParsoid;
	if ( isset( $dataParsoid->a ) && isset( $dataParsoid->sa ) ) {
		$aKeys = array_keys( $dataParsoid->a );
		foreach ( $aKeys as $k ) {
			// Attrib not present -- sanitized away!
			if ( !KV::lookupKV( $attribs, (string)$k ) ) {
				$v = $dataParsoid->sa[$k] ?? null;
				// FIXME: The tokenizer and attribute shadowing currently
				// don't make much effort towards distinguishing the use
				// of HTML empty attribute syntax. We can derive whether
				// empty attribute syntax was used from the attributes
				// srcOffsets in the Sanitizer, from the key end position
				// and value start position being different.
				if ( $v !== null && $v !== '' ) {
					$out[] = $k . '="' . str_replace( '"', '&quot;', $v ) . '"';
				} else {
					$out[] = $k;
				}
			}
		}
	}
	// XXX: round-trip optional whitespace / line breaks etc
	return implode( ' ', $out );
}
497 | |
/**
 * Emit the recorded 'liHackSrc' of a node when the node sits in a position
 * where the LI hack applies.
 *
 * FIXME: Get rid of this function after content version 2.2.0 has expired from caches.
 *
 * @param Element $node
 */
public function handleLIHackIfApplicable( Element $node ): void {
	$liHackSrc = DOMDataUtils::getDataParsoid( $node )->liHackSrc ?? null;
	$prev = DiffDOMUtils::previousNonSepSibling( $node );

	if ( $liHackSrc === null ) {
		return;
	}

	// If we are dealing with an LI hack, then we must ensure that
	// we are dealing with either
	//
	// 1. A node with no previous sibling inside of a list.
	//
	// 2. A node whose previous sibling is a list element.
	$applies = $prev === null
		? DOMUtils::isList( $node->parentNode )
		: DOMUtils::isListItem( $prev );

	if ( $applies ) {
		$this->state->emitChunk( $liHackSrc, $node );
	}
}
522 | |
/**
 * Substitute $value for the first run of underscores ("hole") in $format.
 *
 * A non-empty value shorter than the hole is right-padded with spaces up to
 * the hole's length; an empty value removes the hole without padding.
 *
 * @param string $format Format fragment containing a `_+` hole.
 * @param string $value Text to place in the hole.
 * @param bool $forceTrim Whether to trim() $value first.
 * @return string
 */
private function formatStringSubst( string $format, string $value, bool $forceTrim ): string {
	// PORT-FIXME: JS is more agressive and removes various unicode whitespaces
	// (most notably nbsp). Does that matter?
	if ( $forceTrim ) {
		$value = trim( $value );
	}

	$fillHole = static function ( $match ) use ( $value ) {
		if ( $value === '' ) {
			return '';
		}
		// Hole width is measured in bytes, value width in characters.
		$padding = strlen( $match[0] ) - mb_strlen( $value );
		return $padding > 0 ? $value . str_repeat( ' ', $padding ) : $value;
	};

	// Limit of 1: only the first hole is replaced.
	return preg_replace_callback( '/_+/', $fillHole, $format, 1 );
}
539 | |
/**
 * Generates a template parameter sort function that tries to preserve
 * existing ordering but also to follow the order prescribed by the
 * templatedata.
 *
 * The returned comparator sorts by, in order: the position of the nearest
 * original parameter, the templatedata order (alias index when both names
 * share a canonical key), and finally the position in $dataMwKeys.
 *
 * @param array $dpArgInfo Per-parameter info from data-parsoid; each entry
 *   exposes the parameter key as ->k.
 * @param ?array $tplData Templatedata; 'paramOrder' and 'params' are read.
 * @param array $dataMwKeys Parameter keys in their new data-mw order.
 * @return Closure Comparator taking two parameter names.
 */
private function createParamComparator(
	array $dpArgInfo, ?array $tplData, array $dataMwKeys
): Closure {
	// Record order of parameters in new data-mw
	$newOrder = [];
	foreach ( $dataMwKeys as $pos => $name ) {
		$newOrder[$name] = [ 'order' => $pos ];
	}

	// Record order of parameters in templatedata (if present)
	$tplDataOrder = [];
	$aliasMap = [];
	$tplKeys = [];
	if ( $tplData && isset( $tplData['paramOrder'] ) ) {
		foreach ( $tplData['paramOrder'] as $pos => $name ) {
			$tplDataOrder[$name] = [ 'order' => $pos ];
			$aliasMap[$name] = [ 'key' => $name, 'order' => -1 ];
			$tplKeys[] = $name;
			// Aliases have the same sort order as the main name.
			foreach ( $tplData['params'][$name]['aliases'] ?? [] as $aliasPos => $alias ) {
				$aliasMap[$alias] = [ 'key' => $name, 'order' => $aliasPos ];
			}
		}
	}

	// Record order of parameters in original wikitext (from data-parsoid)
	$origOrder = [];
	foreach ( $dpArgInfo as $pos => $argInfo ) {
		$origOrder[$argInfo->k] = [ 'order' => $pos, 'dist' => 0 ];
	}
	// Canonical parameter key gets the same order as an alias parameter
	// found in the original wikitext. This needs the complete $origOrder,
	// hence the second pass.
	foreach ( $dpArgInfo as $argInfo ) {
		$canonical = $aliasMap[$argInfo->k] ?? null;
		if ( $canonical !== null && !array_key_exists( $canonical['key'], $origOrder ) ) {
			$origOrder[$canonical['key']] = $origOrder[$argInfo->k];
		}
	}

	// Find the closest "original parameter" for each templatedata parameter,
	// so that newly-added parameters are placed near the parameters which
	// templatedata says they should be adjacent to.
	$nearestOrder = $origOrder;
	$sweep = static function ( array $orderedKeys, array $acc ) use ( $origOrder, &$nearestOrder ): void {
		foreach ( $orderedKeys as $name ) {
			if ( isset( $origOrder[$name] ) ) {
				$acc = $origOrder[$name];
			}
			// Keep an existing entry only if it is strictly closer.
			if ( !isset( $nearestOrder[$name] ) || $nearestOrder[$name]['dist'] >= $acc['dist'] ) {
				$nearestOrder[$name] = $acc;
			}
			$acc = [ 'order' => $acc['order'], 'dist' => $acc['dist'] + 1 ];
		}
	};
	// Find closest original parameter before the key.
	$sweep( $tplKeys, [ 'order' => -1, 'dist' => 2 * count( $tplKeys ) ] );
	// Find closest original parameter after the key.
	$sweep(
		array_reverse( $tplKeys ),
		[ 'order' => count( $origOrder ), 'dist' => count( $tplKeys ) ]
	);

	// Helper function to return a large number if the given key isn't
	// in the sort order map
	$big = max( count( $nearestOrder ), count( $newOrder ) );
	$orderOf = static function ( $map, $key1, $key2 = null ) use ( $big ) {
		if ( $key2 && !isset( $map[$key1] ) ) {
			$key1 = $key2;
		}
		return $map[$key1]['order'] ?? $big;
	};

	return static function ( $a, $b ) use (
		$aliasMap, $orderOf, $nearestOrder, $tplDataOrder, $newOrder
	) {
		$aCanon = $aliasMap[$a] ?? [ 'key' => $a, 'order' => -1 ];
		$bCanon = $aliasMap[$b] ?? [ 'key' => $b, 'order' => -1 ];

		// primary key is `nearestOrder` (nearest original parameter)
		$aRank = $orderOf( $nearestOrder, $a, $aCanon['key'] );
		$bRank = $orderOf( $nearestOrder, $b, $bCanon['key'] );
		if ( $aRank !== $bRank ) {
			return $aRank - $bRank;
		}

		// secondary key is templatedata order
		if ( $aCanon['key'] === $bCanon['key'] ) {
			return $aCanon['order'] - $bCanon['order'];
		}
		$aRank = $orderOf( $tplDataOrder, $aCanon['key'] );
		$bRank = $orderOf( $tplDataOrder, $bCanon['key'] );
		if ( $aRank !== $bRank ) {
			return $aRank - $bRank;
		}

		// tertiary key is original input order (makes sort stable)
		return $orderOf( $newOrder, $a ) - $orderOf( $newOrder, $b );
	};
}
640 | |
641 | /** |
642 | * Serialize part of a templatelike expression. |
643 | * @param SerializerState $state |
644 | * @param string $buf |
645 | * @param Element $node |
646 | * @param TemplateInfo $part The expression fragment to serialize. See $srcParts |
647 | * in serializeFromParts() for format. |
648 | * @param ?array $tplData Templatedata, see |
649 | * https://github.com/wikimedia/mediawiki-extensions-TemplateData/blob/master/Specification.md |
650 | * @param string|TemplateInfo $prevPart Previous part. See $srcParts in serializeFromParts(). |
651 | * @param string|TemplateInfo $nextPart Next part. See $srcParts in serializeFromParts(). |
652 | * @return string |
653 | */ |
654 | private function serializePart( |
655 | SerializerState $state, string $buf, Element $node, TemplateInfo $part, |
656 | ?array $tplData, $prevPart, $nextPart |
657 | ): string { |
658 | // Parse custom format specification, if present. |
659 | $defaultBlockSpc = "{{_\n| _ = _\n}}"; // "block" |
660 | $defaultInlineSpc = '{{_|_=_}}'; // "inline" |
661 | |
662 | $format = isset( $tplData['format'] ) ? strtolower( $tplData['format'] ) : null; |
663 | if ( $format === 'block' ) { |
664 | $format = $defaultBlockSpc; |
665 | } elseif ( $format === 'inline' ) { |
666 | $format = $defaultInlineSpc; |
667 | } |
668 | // Check format string for validity. |
669 | preg_match( self::FORMATSTRING_REGEXP, $format ?? '', $parsedFormat ); |
670 | if ( !$parsedFormat ) { |
671 | preg_match( self::FORMATSTRING_REGEXP, $defaultInlineSpc, $parsedFormat ); |
672 | $format = null; // Indicates that no valid custom format was present. |
673 | } |
674 | $formatSOL = $parsedFormat[1] ?? ''; |
675 | $formatStart = $parsedFormat[2] ?? ''; |
676 | $formatParamName = $parsedFormat[3] ?? ''; |
677 | $formatParamValue = $parsedFormat[4] ?? ''; |
678 | $formatEnd = $parsedFormat[5] ?? ''; |
679 | $formatEOL = $parsedFormat[6] ?? ''; |
680 | $forceTrim = ( $format !== null ) || WTUtils::isNewElt( $node ); |
681 | |
682 | // Shoehorn formatting of top-level templatearg wikitext into this code. |
683 | if ( $part->type === 'templatearg' ) { |
684 | $formatStart = preg_replace( '/{{/', '{{{', $formatStart, 1 ); |
685 | $formatEnd = preg_replace( '/}}/', '}}}', $formatEnd, 1 ); |
686 | } |
687 | |
688 | // handle SOL newline requirement |
689 | if ( $formatSOL && !str_ends_with( ( $prevPart !== null ) ? $buf : ( $state->sep->src ?? '' ), "\n" ) ) { |
690 | $buf .= "\n"; |
691 | } |
692 | |
693 | // open the transclusion |
694 | $buf .= $this->formatStringSubst( $formatStart, $part->targetWt, $forceTrim ); |
695 | |
696 | // Short-circuit transclusions without params |
697 | $paramKeys = array_map( fn ( ParamInfo $pi ) => $pi->k, $part->paramInfos ); |
698 | if ( !$paramKeys ) { |
699 | if ( substr( $formatEnd, 0, 1 ) === "\n" ) { |
700 | $formatEnd = substr( $formatEnd, 1 ); |
701 | } |
702 | return $buf . $formatEnd; |
703 | } |
704 | |
705 | // Trim whitespace from data-mw keys to deal with non-compliant |
706 | // clients. Make sure param info is accessible for the stripped key |
707 | // since later code will be using the stripped key always. |
708 | $tplKeysFromDataMw = []; |
709 | foreach ( $part->paramInfos as $pi ) { |
710 | $strippedKey = trim( $pi->k ); |
711 | $tplKeysFromDataMw[$strippedKey] = $pi; |
712 | } |
713 | |
714 | // Per-parameter info from data-parsoid for pre-existing parameters |
715 | $dp = DOMDataUtils::getDataParsoid( $node ); |
716 | // Account for clients not setting the `i`, see T238721 |
717 | $dpArgInfo = isset( $part->i ) ? ( $dp->pi[$part->i] ?? [] ) : []; |
718 | |
719 | // Build a key -> arg info map |
720 | $dpArgInfoMap = []; |
721 | foreach ( $dpArgInfo as $info ) { |
722 | $dpArgInfoMap[$info->k] = $info; |
723 | } |
724 | |
725 | // 1. Process all parameters and build a map of |
726 | // arg-name -> [serializeAsNamed, name, value] |
727 | // |
728 | // 2. Serialize tpl args in required order |
729 | // |
730 | // 3. Format them according to formatParamName/formatParamValue |
731 | |
732 | $kvMap = []; |
733 | foreach ( $tplKeysFromDataMw as $key => $param ) { |
734 | // Storing keys in an array can turn them into ints; stringify. |
735 | $key = (string)$key; |
736 | $argInfo = $dpArgInfoMap[$key] ?? []; |
737 | |
738 | // TODO: Other formats? |
739 | // Only consider the html parameter if the wikitext one |
740 | // isn't present at all. If it's present but empty, |
741 | // that's still considered a valid parameter. |
742 | if ( $param->valueWt !== null ) { |
743 | $value = $param->valueWt; |
744 | } elseif ( $param->html !== null ) { |
745 | $value = $this->htmlToWikitext( [ 'env' => $this->env ], $param->html ); |
746 | } else { |
747 | $this->env->log( |
748 | 'error', |
749 | "params in data-mw part is missing wt/html for $key. " . |
750 | "Serializing as empty string.", |
751 | "data-mw part: " . json_encode( $part->toJsonArray() ) |
752 | ); |
753 | $value = ""; |
754 | } |
755 | |
756 | Assert::invariant( is_string( $value ), "For param: $key, wt property should be a string ' |
757 | . 'but got: $value" ); |
758 | |
759 | $serializeAsNamed = !empty( $argInfo->named ); |
760 | |
761 | // The name is usually equal to the parameter key, but |
762 | // if there's a key->wt attribute, use that. |
763 | $name = null; |
764 | if ( $param->keyWt !== null ) { |
765 | $name = $param->keyWt; |
766 | // And make it appear even if there wasn't any data-parsoid information. |
767 | $serializeAsNamed = true; |
768 | } else { |
769 | $name = $key; |
770 | } |
771 | |
772 | // Use 'k' as the key, not 'name'. |
773 | // |
774 | // The normalized form of 'k' is used as the key in both |
775 | // data-parsoid and data-mw. The full non-normalized form |
776 | // is present in '$param->keyWt' |
777 | $kvMap[$key] = [ 'serializeAsNamed' => $serializeAsNamed, 'name' => $name, 'value' => $value ]; |
778 | } |
779 | |
780 | $argOrder = array_keys( $kvMap ); |
781 | usort( $argOrder, $this->createParamComparator( $dpArgInfo, $tplData, $argOrder ) ); |
782 | |
783 | $argIndex = 1; |
784 | $numericIndex = 1; |
785 | |
786 | $numPositionalArgs = 0; |
787 | foreach ( $dpArgInfo as $pi ) { |
788 | if ( isset( $tplKeysFromDataMw[trim( $pi->k )] ) && empty( $pi->named ) ) { |
789 | $numPositionalArgs++; |
790 | } |
791 | } |
792 | |
793 | $argBuf = []; |
794 | foreach ( $argOrder as $param ) { |
795 | $kv = $kvMap[$param]; |
796 | // Add nowiki escapes for the arg value, as required |
797 | $escapedValue = $this->wteHandlers->escapeTplArgWT( $kv['value'], [ |
798 | 'serializeAsNamed' => $kv['serializeAsNamed'] || $param !== $numericIndex, |
799 | 'type' => $part->type, |
800 | 'argPositionalIndex' => $numericIndex, |
801 | 'numPositionalArgs' => $numPositionalArgs, |
802 | 'argIndex' => $argIndex++, |
803 | 'numArgs' => count( $tplKeysFromDataMw ), |
804 | ] ); |
805 | if ( $escapedValue['serializeAsNamed'] ) { |
806 | // WS trimming for values of named args |
807 | $argBuf[] = [ 'dpKey' => $param, 'name' => $kv['name'], 'value' => trim( $escapedValue['v'] ) ]; |
808 | } else { |
809 | $numericIndex++; |
810 | // No WS trimming for positional args |
811 | $argBuf[] = [ 'dpKey' => $param, 'name' => null, 'value' => $escapedValue['v'] ]; |
812 | } |
813 | } |
814 | |
815 | // If no explicit format is provided, default format is: |
816 | // - 'inline' for new args |
817 | // - whatever format is available from data-parsoid for old args |
818 | // (aka, overriding formatParamName/formatParamValue) |
819 | // |
820 | // If an unedited node OR if paramFormat is unspecified, |
821 | // this strategy prevents unnecessary normalization |
822 | // of edited transclusions which don't have valid |
823 | // templatedata formatting information. |
824 | |
825 | // "magic case": If the format string ends with a newline, an extra newline is added |
826 | // between the template name and the first parameter. |
827 | |
828 | foreach ( $argBuf as $arg ) { |
829 | $name = $arg['name']; |
830 | $val = $arg['value']; |
831 | if ( $name === null ) { |
832 | // We are serializing a positional parameter. |
833 | // Whitespace is significant for these and |
834 | // formatting would change semantics. |
835 | $name = ''; |
836 | $modFormatParamName = '|_'; |
837 | $modFormatParamValue = '_'; |
838 | } elseif ( $name === '' ) { |
839 | // No spacing for blank parameters ({{foo|=bar}}) |
840 | // This should be an edge case and probably only for |
841 | // inline-formatted templates, but we are consciously |
842 | // forcing this default here. Can revisit if this is |
843 | // ever a problem. |
844 | $modFormatParamName = '|_='; |
845 | $modFormatParamValue = '_'; |
846 | } else { |
847 | // Preserve existing spacing, esp if there was a comment |
848 | // embedded in it. Otherwise, follow TemplateData's lead. |
849 | // NOTE: In either case, we are forcibly normalizing |
850 | // non-block-formatted transclusions into block formats |
851 | // by adding missing newlines. |
852 | $spc = $dpArgInfoMap[$arg['dpKey']]->spc ?? null; |
853 | if ( $spc && ( !$format || preg_match( Utils::COMMENT_REGEXP, $spc[3] ?? '' ) ) ) { |
854 | $nl = ( substr( $formatParamName, 0, 1 ) === "\n" ) ? "\n" : ''; |
855 | $modFormatParamName = $nl . '|' . $spc[0] . '_' . $spc[1] . '=' . $spc[2]; |
856 | $modFormatParamValue = '_' . $spc[3]; |
857 | } else { |
858 | $modFormatParamName = $formatParamName; |
859 | $modFormatParamValue = $formatParamValue; |
860 | } |
861 | } |
862 | |
863 | // Don't create duplicate newlines. |
864 | $trailing = preg_match( self::TRAILING_COMMENT_OR_WS_AFTER_NL_REGEXP, $buf ); |
865 | if ( $trailing && substr( $formatParamName, 0, 1 ) === "\n" ) { |
866 | $modFormatParamName = substr( $formatParamName, 1 ); |
867 | } |
868 | |
869 | $buf .= $this->formatStringSubst( $modFormatParamName, $name, $forceTrim ); |
870 | $buf .= $this->formatStringSubst( $modFormatParamValue, $val, $forceTrim ); |
871 | } |
872 | |
873 | // Don't create duplicate newlines. |
874 | if ( preg_match( self::TRAILING_COMMENT_OR_WS_AFTER_NL_REGEXP, $buf ) |
875 | && substr( $formatEnd, 0, 1 ) === "\n" |
876 | ) { |
877 | $buf .= substr( $formatEnd, 1 ); |
878 | } else { |
879 | $buf .= $formatEnd; |
880 | } |
881 | |
882 | if ( $formatEOL ) { |
883 | if ( $nextPart === null ) { |
884 | // This is the last part of the block. Add the \n only |
885 | // if the next non-comment node is not a text node |
886 | // of if the text node doesn't have a leading \n. |
887 | $next = DiffDOMUtils::nextNonDeletedSibling( $node ); |
888 | while ( $next instanceof Comment ) { |
889 | $next = DiffDOMUtils::nextNonDeletedSibling( $next ); |
890 | } |
891 | if ( !( $next instanceof Text ) || substr( $next->nodeValue, 0, 1 ) !== "\n" ) { |
892 | $buf .= "\n"; |
893 | } |
894 | } elseif ( !is_string( $nextPart ) || substr( $nextPart, 0, 1 ) !== "\n" ) { |
895 | // If nextPart is another template, and it wants a leading nl, |
896 | // this \n we add here will count towards that because of the |
897 | // formatSOL check at the top. |
898 | $buf .= "\n"; |
899 | } |
900 | } |
901 | |
902 | return $buf; |
903 | } |
904 | |
/**
 * Serialize a transclusion from the parts recorded in its data-mw.
 *
 * String parts are appended to the output verbatim. Every other part is
 * passed to serializePart(), which receives the buffer built so far and
 * returns the extended buffer. Parts without a `targetWt` are logged as
 * malformed and skipped.
 *
 * @param SerializerState $state
 * @param Element $node
 * @param list<string|TemplateInfo> $srcParts Template parts
 * @return string
 */
public function serializeFromParts(
	SerializerState $state, Element $node, array $srcParts
): string {
	// TemplateData is only fetched for new nodes or nodes carrying diff markers.
	$useTplData = WTUtils::isNewElt( $node ) || DiffUtils::hasDiffMarkers( $node );
	$buf = '';
	foreach ( $srcParts as $i => $part ) {
		if ( is_string( $part ) ) {
			$buf .= $part;
			continue;
		}

		$prevPart = $srcParts[$i - 1] ?? null;
		$nextPart = $srcParts[$i + 1] ?? null;

		if ( !isset( $part->targetWt ) ) {
			// Maybe we should just raise a ClientError
			$this->env->log( 'error', 'data-mw.parts array is malformed: ',
				DOMCompat::getOuterHTML( $node ), PHPUtils::jsonEncode( $srcParts ) );
			continue;
		}

		// Account for clients leaving off the params array, presumably when empty.
		// See T291741
		$part->paramInfos ??= [];

		// transclusion: tpl or parser function?
		// templates have $part->href
		// parser functions have $part->func
		//
		// Template args are always serialized without TemplateData, so the
		// lookup below is skipped for them and $tplData stays null.
		//
		// While the API supports fetching multiple template data objects in one call,
		// we will fetch one at a time to benefit from cached responses.
		$tplData = null;
		if ( $part->type !== 'templatearg' && isset( $part->href ) && $useTplData ) {
			// Not a parser function
			try {
				$title = Title::newFromText(
					PHPUtils::stripPrefix( Utils::decodeURIComponent( $part->href ), './' ),
					$this->env->getSiteConfig()
				);
				$tplData = $this->env->getDataAccess()->fetchTemplateData(
					$this->env->getPageConfig(), $title
				);
			} catch ( Exception $err ) {
				// Log the error, and use default serialization mode.
				// Better to misformat a transclusion than to lose an edit.
				$this->env->log( 'error/html2wt/tpldata', $err );
			}
		}

		// If the template doesn't exist, or does but has no TemplateData, ignore it
		if ( !empty( $tplData['missing'] ) || !empty( $tplData['notemplatedata'] ) ) {
			$tplData = null;
		}

		$buf = $this->serializePart( $state, $buf, $node, $part, $tplData, $prevPart, $nextPart );
	}
	return $buf;
}
977 | |
/**
 * Build the start tag of an extension element from its data-mw.
 *
 * The data-mw attributes (plus the node's `about` and `typeof`, when
 * present) are wrapped in a TagTk and rendered via serializeAttributes().
 * The tag is emitted self-closed when data-mw carries no (non-empty) body.
 *
 * @param Element $node
 * @param SerializerState $state Not read by this method.
 * @return string
 */
public function serializeExtensionStartTag( Element $node, SerializerState $state ): string {
	$dataMw = DOMDataUtils::getDataMw( $node );
	$tagName = $dataMw->name;

	// Serialize extension attributes in normalized form as:
	// key='value'
	// FIXME: with no dataParsoid, shadow info will mark it as new
	$kvs = [];
	foreach ( (array)( $dataMw->attrs ?? [] ) as $attrKey => $attrValue ) {
		$kvs[] = new KV( $attrKey, $attrValue );
	}
	$extTok = new TagTk( $tagName, $kvs );

	// Carry the node's own about/typeof over to the token, in that order.
	foreach ( [ 'about', 'typeof' ] as $attrName ) {
		$attrValue = DOMCompat::getAttribute( $node, $attrName );
		if ( $attrValue !== null ) {
			$extTok->addAttribute( $attrName, $attrValue );
		}
	}

	$attrStr = $this->serializeAttributes( $node, $extTok );
	$src = '<' . $tagName . ( $attrStr ? ' ' . $attrStr : '' );

	return $src . ( empty( $dataMw->body ) ? ' />' : '>' );
}
1006 | |
/**
 * Fallback serialization of an extension element.
 *
 * Emits the start tag, then the extension source stored in
 * data-mw `body->extsrc`, then the end tag. If data-mw has no body, only
 * the (self-closed) start tag is returned. If the body has no string
 * `extsrc`, the original source from data-parsoid is returned instead
 * when available; otherwise an empty-bodied tag pair is emitted. Both
 * fallbacks are logged as errors.
 *
 * @param Element $node
 * @param SerializerState $state
 * @return string
 */
public function defaultExtensionHandler( Element $node, SerializerState $state ): string {
	$dp = DOMDataUtils::getDataParsoid( $node );
	$dataMw = DOMDataUtils::getDataMw( $node );
	$startTag = $this->serializeExtensionStartTag( $node, $state );

	if ( !isset( $dataMw->body ) ) {
		// We self-closed this already.
		return $startTag;
	}

	$extSrc = $dataMw->body->extsrc ?? null;
	if ( !is_string( $extSrc ) ) {
		if ( isset( $dp->src ) ) {
			$this->env->log(
				'error/html2wt/ext',
				'Extension data-mw missing for: ' . DOMCompat::getOuterHTML( $node )
			);
			return $dp->src;
		}

		$this->env->log(
			'error/html2wt/ext',
			'Extension src unavailable for: ' . DOMCompat::getOuterHTML( $node )
		);
		// Nothing usable: fall through with an empty body.
		$extSrc = '';
	}

	return $startTag . $extSrc . '</' . $dataMw->name . '>';
}
1029 | |
/**
 * Emit a chunk of text, routing separator-like whitespace found at its
 * edges into the serializer state's separator instead of the chunk.
 *
 * @param string $res
 * @param Node $node
 */
private function serializeText( string $res, Node $node ): void {
	$state = $this->state;
	$suffixRE = self::$separatorREs['sepSuffixWithNlsRE'];

	// Split off trailing separator-like text (at least 1 newline and other whitespace)
	preg_match( $suffixRE, $res, $trailingSep );
	$res = preg_replace( $suffixRE, '', $res, 1 );

	// Outside indent-pre, leading newlines and other whitespace are
	// moved into the pending separator.
	if ( !$state->inIndentPre
		&& preg_match( self::$separatorREs['sepPrefixWithNlsRE'], $res, $leadingSep )
	) {
		$state->appendSep( $leadingSep[0] );
		$res = substr( $res, strlen( $leadingSep[0] ) );
	}

	$state->emitChunk(
		$state->needsEscaping ? Utils::escapeWtEntities( $res ) : $res,
		$node
	);

	// Move trailing newlines into the next separator, but only if no
	// separator source is pending after the chunk was emitted.
	// SSS FIXME: otherwise the stripped NLs are simply dropped.
	if ( $trailingSep && !$state->sep->src ) {
		$state->appendSep( $trailingSep[0] );
	}
}
1064 | |
/**
 * Serialize a text node's value with wikitext escaping switched on for
 * the duration of the call.
 *
 * @param Node $node
 * @return Node|null The node's next sibling
 */
private function serializeTextNode( Node $node ): ?Node {
	$state = $this->state;

	$state->needsEscaping = true;
	$this->serializeText( $node->nodeValue, $node );
	$state->needsEscaping = false;

	return $node->nextSibling;
}
1076 | |
/**
 * Emit non-separator wikitext that does not need to be escaped.
 *
 * Thin public entry point over serializeText(); the escaping flag on the
 * state is left untouched here.
 *
 * @param string $res Wikitext to emit
 * @param Node $node Node the wikitext is attributed to
 */
public function emitWikitext( string $res, Node $node ): void {
	$this->serializeText( $res, $node );
}
1085 | |
/**
 * DOM-based serialization of a single element.
 *
 * In selser mode, an element without diff markers that passes the checks
 * below is serialized by re-emitting its original source (looked up via
 * its DSR). Otherwise the element is passed to its DOM handler.
 *
 * @param Element $node
 * @param DOMHandler $domHandler
 * @return Node|null The next node to serialize
 */
private function serializeNodeInternal( Element $node, DOMHandler $domHandler ): ?Node {
	// To serialize a node from source, the node should satisfy these
	// conditions:
	//
	// 1. It should not have a diff marker or be in a modified subtree
	//    WTS should not be in a subtree with a modification flag that
	//    applies to every node of a subtree (rather than an indication
	//    that some node in the subtree is modified).
	//
	// 2. It should continue to be valid in any surrounding edited context
	//    For some nodes, modification of surrounding context
	//    can change serialized output of this node
	//    (ex: <td>s and whether you emit | or || for them)
	//
	// 3. It should have valid, usable DSR
	//
	// 4. Either it has non-zero positive DSR width, or meets one of the
	//    following:
	//
	//    4a. It is content like <p><br/><p> or an automatically-inserted
	//        wikitext <references/> (HTML <ol>) (will have dsr-width 0)
	//    4b. it is fostered content (will have dsr-width 0)
	//    4c. it is misnested content (will have dsr-width 0)
	//
	// SSS FIXME: Additionally, we can guard against buggy DSR with
	// some validity checks. We can test that non-sep src content
	// leading wikitext markup corresponds to the node type.
	//
	// Ex: If node.nodeName is 'UL', then src[0] should be '*'
	//
	// TO BE DONE

	$state = $this->state;
	$wrapperUnmodified = false;
	$dp = DOMDataUtils::getDataParsoid( $node );

	if ( $state->selserMode
		&& !$state->inInsertedContent
		&& WTSUtils::origSrcValidInEditedContext( $state, $node )
		&& Utils::isValidDSR( $dp->dsr ?? null )
		&& ( $dp->dsr->end > $dp->dsr->start
			// FIXME: <p><br/></p>
			// nodes that have dsr width 0 because currently,
			// we emit newlines outside the p-nodes. So, this check
			// tries to handle that scenario.
			|| (
				$dp->dsr->end === $dp->dsr->start && (
					in_array( DOMCompat::nodeName( $node ), [ 'p', 'br' ], true )
					|| !empty( DOMDataUtils::getDataMw( $node )->autoGenerated )
					// FIXME: This is only necessary while outputContentVersion
					// 2.1.2 - 2.2.0 are still valid
					|| DOMUtils::hasTypeOf( $node, 'mw:Placeholder/StrippedTag' )
				)
			)
			|| !empty( $dp->fostered )
			|| !empty( $dp->misnested )
		)
	) {
		if ( !DiffUtils::hasDiffMarkers( $node ) ) {
			// If this HTML node will disappear in wikitext because of
			// zero width, then the separator constraints will carry over
			// to the node's children.
			//
			// Since we don't recurse into 'node' in selser mode, we update the
			// separator constraintInfo to apply to 'node' and its first child.
			//
			// We could clear constraintInfo altogether which would be
			// correct (but could normalize separators and introduce dirty
			// diffs unnecessarily).

			$state->currNodeUnmodified = true;

			if ( WTUtils::isZeroWidthWikitextElt( $node )
				&& $node->hasChildNodes()
				&& ( $state->sep->constraints['constraintInfo']['sepType'] ?? null ) === 'sibling'
			) {
				$state->sep->constraints['constraintInfo']['onSOL'] = $state->onSOL;
				$state->sep->constraints['constraintInfo']['sepType'] = 'parent-child';
				$state->sep->constraints['constraintInfo']['nodeA'] = $node;
				$state->sep->constraints['constraintInfo']['nodeB'] = $node->firstChild;
			}

			$out = $state->getOrigSrc( $dp->dsr ) ?? '';

			$this->trace( 'ORIG-src with DSR', static function () use ( $dp, $out ) {
				return '[' . $dp->dsr->start . ',' . $dp->dsr->end . '] = '
					. PHPUtils::jsonEncode( $out );
			} );

			// When reusing source, we should only suppress serializing
			// to a single line for the cases we've allowed in normal serialization.
			// <a> tags might look surprising here, but, here is the rationale.
			// If some link syntax (wikilink, extlink, etc.) accepted a newline
			// originally, we can safely let it through here. There is no need to have
			// specific checks for wikilinks / extlinks / ... etc. The only concern is
			// if the surrounding context in which this link-syntax is embedded also
			// breaks the link syntax. There is no such syntax right now.
			// FIXME: Note the limitation here, that if these nodes are nested
			// in something as trivial as an i / b, the suppression won't happen
			// and we'll dirty the text.
			$suppressSLC = WTUtils::isFirstEncapsulationWrapperNode( $node )
				|| DOMUtils::hasTypeOf( $node, 'mw:Nowiki' )
				|| in_array( DOMCompat::nodeName( $node ), [ 'dl', 'ul', 'ol', 'a' ], true )
				|| ( DOMCompat::nodeName( $node ) === 'table'
					&& DOMCompat::nodeName( $node->parentNode ) === 'dd'
					&& DiffDOMUtils::previousNonSepSibling( $node ) === null );

			// Use selser to serialize this text! The original
			// wikitext is `out`. But first allow
			// `ConstrainedText.fromSelSer` to figure out the right
			// type of ConstrainedText chunk(s) to use to represent
			// `out`, based on the node type. Since we might actually
			// have to break this wikitext into multiple chunks,
			// `fromSelSer` returns an array.
			if ( $suppressSLC ) {
				$state->singleLineContext->disable();
			}
			foreach ( ConstrainedText::fromSelSer( $out, $node, $dp, $this->env ) as $ct ) {
				$state->emitChunk( $ct, $ct->node );
			}
			if ( $suppressSLC ) {
				$state->singleLineContext->pop();
			}

			// Skip over encapsulated content since it has already been
			// serialized.
			if ( WTUtils::isFirstEncapsulationWrapperNode( $node ) ) {
				return WTUtils::skipOverEncapsulatedContent( $node );
			} else {
				return $node->nextSibling;
			}
		}

		// The node carries diff markers: record whether only its subtree
		// changed and its DSR has valid tag widths, for the DOM handler.
		$wrapperUnmodified = DiffUtils::onlySubtreeChanged( $node ) &&
			WTSUtils::hasValidTagWidths( $dp->dsr ?? null );
	}

	$state->currNodeUnmodified = false;

	// Flag inserted content for the duration of the handler call and
	// restore the previous flag value afterwards.
	$currentInsertedState = $state->inInsertedContent;

	$inInsertedContent = $state->selserMode && DiffUtils::hasInsertedDiffMark( $node );

	if ( $inInsertedContent ) {
		$state->inInsertedContent = true;
	}

	$next = $domHandler->handle( $node, $state, $wrapperUnmodified );

	if ( $inInsertedContent ) {
		$state->inInsertedContent = $currentInsertedState;
	}

	return $next;
}
1247 | |
/**
 * Internal worker. Recursively serialize a DOM subtree.
 *
 * Dispatches on the node type:
 * - diff-marker elements only update modification/separator state;
 * - other elements go through serializeNodeInternal() with their DOM handler;
 * - text nodes that are pure separators (outside indent-pre) and comment
 *   nodes are folded into the pending separator;
 * - remaining text nodes go through serializeTextNode().
 * Separator constraints are updated before and after the node is serialized.
 *
 * @param Node $node
 * @return ?Node The next node to serialize
 * @throws InternalException For node types other than element, text and comment
 */
public function serializeNode( Node $node ): ?Node {
	$nodeName = DOMCompat::nodeName( $node );
	$domHandlerFactory = new DOMHandlerFactory();
	$state = $this->state;
	$state->currNode = $node;

	$traceName = static function () use ( $node ) {
		return WTSUtils::traceNodeName( $node );
	};
	if ( $state->selserMode ) {
		$this->trace(
			$traceName,
			'; prev-unmodified: ', $state->prevNodeUnmodified,
			'; SOL: ', $state->onSOL );
	} else {
		$this->trace( $traceName, '; SOL: ', $state->onSOL );
	}

	switch ( $node->nodeType ) {
		case XML_ELEMENT_NODE:
			'@phan-var Element $node';/** @var Element $node */
			// Ignore DiffMarker metas, but clear unmodified node state
			if ( DiffUtils::isDiffMarker( $node ) ) {
				$state->updateModificationFlags( $node );
				// `state.sep.lastSourceNode` is cleared here so that removed
				// separators between otherwise unmodified nodes don't get
				// restored.
				$state->updateSep( $node );
				return $node->nextSibling;
			}
			$isElement = true;
			$domHandler = $domHandlerFactory->getDOMHandler( $node );
			break;
		case XML_TEXT_NODE:
			// This code assumes that the DOM is in normalized form with no
			// run of text nodes.
			// Accumulate whitespace from the text node into state.sep.src
			$text = $node->nodeValue;
			if ( !$state->inIndentPre
				// PORT-FIXME: original uses this->state->serializer->separatorREs
				// but that does not seem useful
				&& preg_match( self::$separatorREs['pureSepRE'], $text )
			) {
				$state->appendSep( $text );
				return $node->nextSibling;
			}
			if ( $state->selserMode ) {
				$prev = $node->previousSibling;
				if ( !$state->inInsertedContent && (
					( !$prev && DOMUtils::atTheTop( $node->parentNode ) ) ||
					( $prev && !DiffUtils::isDiffMarker( $prev ) )
				) ) {
					$state->currNodeUnmodified = true;
				} else {
					$state->currNodeUnmodified = false;
				}
			}

			$isElement = false;
			$domHandler = new DOMHandler( false );
			break;
		case XML_COMMENT_NODE:
			// Merge this into separators
			$state->appendSep( WTSUtils::commentWT( $node->nodeValue ) );
			return $node->nextSibling;
		default:
			throw new InternalException( 'Unhandled node type: ' . $node->nodeType );
	}

	$prev = DiffDOMUtils::previousNonSepSibling( $node ) ?: $node->parentNode;
	$this->env->log( 'debug/wts', 'Before constraints for ' . $nodeName );
	$state->separators->updateSeparatorConstraints(
		$prev, $domHandlerFactory->getDOMHandler( $prev ),
		$node, $domHandler
	);

	// Call the serialization method directly rather than through an
	// array callable, so the dispatch is visible to static analysis.
	$this->env->log( 'debug/wts', 'Calling serialization handler for ' . $nodeName );
	if ( $isElement ) {
		$nextNode = $this->serializeNodeInternal( $node, $domHandler );
	} else {
		$nextNode = $this->serializeTextNode( $node );
	}

	$next = DiffDOMUtils::nextNonSepSibling( $node ) ?: $node->parentNode;
	$this->env->log( 'debug/wts', 'After constraints for ' . $nodeName );
	$state->separators->updateSeparatorConstraints(
		$node, $domHandler,
		$next, $domHandlerFactory->getDOMHandler( $next )
	);

	// Update modification flags
	$state->updateModificationFlags( $node );

	return $nextNode;
}
1349 | |
/**
 * Undo heading-related nowiki escaping on a line where it turned out to
 * be unnecessary.
 *
 * The line is returned unchanged unless heading escapes were recorded on
 * the state, the line matches HEADING_NOWIKI_REGEXP, and the second
 * captured group does not match COMMENT_OR_WS_REGEXP. In that case the
 * first group is re-escaped via the wikitext escape handlers and the
 * second group is appended as-is.
 *
 * @param string $line
 * @return string
 */
private function stripUnnecessaryHeadingNowikis( string $line ): string {
	$state = $this->state;
	if ( !$state->hasHeadingEscapes ) {
		return $line;
	}

	if ( !preg_match( self::HEADING_NOWIKI_REGEXP, $line, $parts ) ) {
		// All is good.
		return $line;
	}

	if ( preg_match( self::COMMENT_OR_WS_REGEXP, $parts[2] ) ) {
		// All is good.
		return $line;
	}

	// The nowikiing was spurious since the trailing = is not in EOL position
	$reEscaped = $state->serializer->wteHandlers->escapedText( $state, false, $parts[1], false, true );
	return $reEscaped . $parts[2];
}
1370 | |
/**
 * Post-process the serialized output ($this->state->out) and remove or
 * shrink `<nowiki>` wrappers around leading whitespace where they are
 * not needed.
 *
 * The output is split on lines that start with sol-transparent wikitext
 * followed by a whitespace-only `<nowiki>…</nowiki>`. For every such
 * line, the nowiki is either unwrapped (leaving the whitespace), removed
 * together with the line's leading whitespace, or replaced by
 * `<nowiki/>` when the rest of the line starts with one of `=*#:;`.
 *
 * NOTE(review): the split result is consumed in strides of four, which
 * assumes the site-config regexp fragment contributes exactly one
 * capture group — confirm against SiteConfig.
 */
private function stripUnnecessaryIndentPreNowikis(): void {
	$siteConfig = $this->env->getSiteConfig();

	// FIXME: The solTransparentWikitextRegexp includes redirects, which really
	// only belong at the SOF and should be unique. See the "New redirect" test.
	$splitRE = '@^'
		. PHPUtils::reStrip( $siteConfig->solTransparentWikitextNoWsRegexp(), '@' )
		. '((?i:<nowiki>\s+</nowiki>))([^\n]*(?:\n|$))' . '@Dm';
	$pieces = preg_split( $splitRE, $this->state->out, -1, PREG_SPLIT_DELIM_CAPTURE );
	$numPieces = count( $pieces );

	$result = $pieces[0];
	for ( $idx = 1; $idx < $numPieces; $idx += 4 ) {
		$result .= $pieces[$idx];
		$nowiki = $pieces[$idx + 1];
		$rest = $pieces[$idx + 2];

		// Collect tags on the rest of the line, ignoring comments
		preg_match_all( '/<[^!][^<>]*>/', $rest, $tagMatches );

		// Not required if just sol transparent wt.
		$keepNowiki = !preg_match( $siteConfig->solTransparentWikitextRegexp(), $rest );

		if ( $keepNowiki ) {
			foreach ( $tagMatches[0] as $rawTag ) {
				// Strip </, attributes, and > to get the tagname
				$tagName = preg_replace( '/<\/?|\s.*|>/', '', $rawTag );
				if ( !isset( Consts::$HTML['HTML5Tags'][$tagName] ) ) {
					// If we encounter any tag that is not a html5 tag,
					// it could be an extension tag. We could do a more complex
					// regexp or tokenize the string to determine if any block tags
					// show up outside the extension tag. But, for now, we just
					// conservatively bail and leave the nowiki as is.
					$keepNowiki = true;
					break;
				}
				if ( TokenUtils::isWikitextBlockTag( $tagName ) ) {
					// FIXME: Extension tags shadowing html5 tags might not
					// have block semantics.
					// Block tags on a line suppress nowikis
					$keepNowiki = false;
				}
			}
		}

		if ( $keepNowiki ) {
			$noWsFragment = PHPUtils::reStrip(
				$siteConfig->solTransparentWikitextNoWsRegexp(), '/' );
			$leadingWsRE = '/^(' . $noWsFragment . ')\s+/';

			// Replace all leading whitespace, repeating until nothing changes
			do {
				$before = $rest;
				$rest = preg_replace( $leadingWsRE, '$1', $rest );
			} while ( $rest !== $before );

			// Protect against sol-sensitive wikitext characters
			$solCharsRE = '/^' . $noWsFragment . '[=*#:;]/';
			$replacement = preg_match( $solCharsRE, $rest ) ? '<nowiki/>' : '';
			$nowiki = preg_replace( '#^<nowiki>(\s+)</nowiki>#', $replacement, $nowiki, 1 );
		} else {
			// Drop the wrapper but keep the whitespace it protected
			$nowiki = preg_replace( '#^<nowiki>(\s+)</nowiki>#', '$1', $nowiki, 1 );
		}

		$result .= $nowiki . $rest . $pieces[$idx + 3];
	}

	$this->state->out = $result;
}
1431 | |
1432 | /** |
1433 | * This implements a heuristic to strip two common sources of <nowiki/>s. |
1434 | * When <i> and <b> tags are matched up properly, |
1435 | * - any single ' char before <i> or <b> does not need <nowiki/> protection. |
1436 | * - any single ' char before </i> or </b> does not need <nowiki/> protection. |
1437 | * @param string $line |
1438 | * @return string |
1439 | */ |
1440 | private function stripUnnecessaryQuoteNowikis( string $line ): string { |
1441 | if ( !$this->state->hasQuoteNowikis ) { |
1442 | return $line; |
1443 | } |
1444 | |
1445 | // Optimization: We are interested in <nowiki/>s before quote chars. |
1446 | // So, skip this if we don't have both. |
1447 | if ( !( preg_match( '#<nowiki\s*/>#', $line ) && preg_match( "/'/", $line ) ) ) { |
1448 | return $line; |
1449 | } |
1450 | |
1451 | // * Split out all the [[ ]] {{ }} '' ''' ''''' <..> </...> |
1452 | // parens in the regexp mean that the split segments will |
1453 | // be spliced into the result array as the odd elements. |
1454 | // * If we match up the tags properly and we see opening |
1455 | // <i> / <b> / <i><b> tags preceded by a '<nowiki/>, we |
1456 | // can remove all those nowikis. |
1457 | // Ex: '<nowiki/>''foo'' bar '<nowiki/>'''baz''' |
1458 | // * If we match up the tags properly and we see closing |
1459 | // <i> / <b> / <i><b> tags preceded by a '<nowiki/>, we |
1460 | // can remove all those nowikis. |
1461 | // Ex: ''foo'<nowiki/>'' bar '''baz'<nowiki/>''' |
1462 | // phpcs:ignore Generic.Files.LineLength.TooLong |
1463 | $p = preg_split( "#('''''|'''|''|\[\[|\]\]|\{\{|\}\}|<\w+(?:\s+[^>]*?|\s*?)/?>|</\w+\s*>)#", $line, -1, PREG_SPLIT_DELIM_CAPTURE ); |
1464 | |
1465 | // Which nowiki do we strip out? |
1466 | $nowikiIndex = -1; |
1467 | |
1468 | // Verify that everything else is properly paired up. |
1469 | $stack = []; |
1470 | $quotesOnStack = 0; |
1471 | $n = count( $p ); |
1472 | $nonHtmlTag = null; |
1473 | for ( $j = 1; $j < $n; $j += 2 ) { |
1474 | // For HTML tags, pull out just the tag name for clearer code below. |
1475 | preg_match( '#^<(/?\w+)#', $p[$j], $matches ); |
1476 | $tag = mb_strtolower( $matches[1] ?? $p[$j] ); |
1477 | $tagLen = strlen( $tag ); |
1478 | $selfClose = false; |
1479 | if ( str_ends_with( $p[$j], '/>' ) ) { |
1480 | $tag .= '/'; |
1481 | $selfClose = true; |
1482 | } |
1483 | |
1484 | // Ignore non-html-tag (<nowiki> OR extension tag) blocks |
1485 | if ( !$nonHtmlTag ) { |
1486 | if ( isset( $this->env->getSiteConfig()->getExtensionTagNameMap()[$tag] ) ) { |
1487 | $nonHtmlTag = $tag; |
1488 | continue; |
1489 | } |
1490 | } else { |
1491 | if ( $tagLen > 0 && $tag[0] === '/' && substr( $tag, 1 ) === $nonHtmlTag ) { |
1492 | $nonHtmlTag = null; |
1493 | } |
1494 | continue; |
1495 | } |
1496 | |
1497 | if ( $tag === ']]' ) { |
1498 | if ( array_pop( $stack ) !== '[[' ) { |
1499 | return $line; |
1500 | } |
1501 | } elseif ( $tag === '}}' ) { |
1502 | if ( array_pop( $stack ) !== '{{' ) { |
1503 | return $line; |
1504 | } |
1505 | } elseif ( $tagLen > 0 && $tag[0] === '/' ) { // closing html tag |
1506 | // match html/ext tags |
1507 | $openTag = array_pop( $stack ); |
1508 | if ( $tag !== ( '/' . $openTag ) ) { |
1509 | return $line; |
1510 | } |
1511 | } elseif ( $tag === 'nowiki/' ) { |
1512 | // We only want to process: |
1513 | // - trailing single quotes (bar') |
1514 | // - or single quotes by themselves without a preceding '' sequence |
1515 | if ( substr( $p[$j - 1], -1 ) === "'" |
1516 | && !( $p[$j - 1] === "'" && $j > 1 && substr( $p[$j - 2], -2 ) === "''" ) |
1517 | // Consider <b>foo<i>bar'</i>baz</b> or <b>foo'<i>bar'</i>baz</b>. |
1518 | // The <nowiki/> before the <i> or </i> cannot be stripped |
1519 | // if the <i> is embedded inside another quote. |
1520 | && ( $quotesOnStack === 0 |
1521 | // The only strippable scenario with a single quote elt on stack |
1522 | // is: ''bar'<nowiki/>'' |
1523 | // -> ["", "''", "bar'", "<nowiki/>", "", "''"] |
1524 | || ( $quotesOnStack === 1 |
1525 | && $j + 2 < $n |
1526 | && $p[$j + 1] === '' |
1527 | && $p[$j + 2][0] === "'" |
1528 | && $p[$j + 2] === PHPUtils::lastItem( $stack ) ) ) |
1529 | ) { |
1530 | $nowikiIndex = $j; |
1531 | } |
1532 | continue; |
1533 | } elseif ( $selfClose || $tag === 'br' ) { |
1534 | // Skip over self-closing tags or what should have been self-closed. |
1535 | // ( While we could do this for all void tags defined in |
1536 | // mediawiki.wikitext.constants.js, <br> is the most common |
1537 | // culprit. ) |
1538 | continue; |
1539 | } elseif ( $tagLen > 0 && $tag[0] === "'" && PHPUtils::lastItem( $stack ) === $tag ) { |
1540 | array_pop( $stack ); |
1541 | $quotesOnStack--; |
1542 | } else { |
1543 | $stack[] = $tag; |
1544 | if ( $tagLen > 0 && $tag[0] === "'" ) { |
1545 | $quotesOnStack++; |
1546 | } |
1547 | } |
1548 | } |
1549 | |
1550 | if ( count( $stack ) ) { |
1551 | return $line; |
1552 | } |
1553 | |
1554 | if ( $nowikiIndex !== -1 ) { |
1555 | // We can only remove the final trailing nowiki. |
1556 | // |
1557 | // HTML : <i>'foo'</i> |
1558 | // line : ''<nowiki/>'foo'<nowiki/>'' |
1559 | $p[$nowikiIndex] = ''; |
1560 | return implode( '', $p ); |
1561 | } else { |
1562 | return $line; |
1563 | } |
1564 | } |
1565 | |
/**
 * Serialize an HTML DOM.
 *
 * WARNING: You probably want to use WikitextContentModelHandler::fromDOM instead.
 *
 * When given a Document, its body is what gets serialized; a
 * DocumentFragment is serialized as given. The node is run through
 * DOMNormalizer before serialization, and the serialized text is
 * post-processed line by line to drop nowikis that turned out to be
 * unnecessary. If the serializer state carries buffered redirect text,
 * that text is prepended to the output.
 *
 * @param Document|DocumentFragment $node Checked with Assert::parameterType;
 *   any other Node subtype is rejected by that assertion.
 * @param bool $selserMode Forwarded to the state's initMode() and used to
 *   pick the log type ('trace/selser' when true, 'trace/wts' otherwise).
 * @return string The accumulated serializer output ($state->out).
 */
public function serializeDOM(
	Node $node, bool $selserMode = false
): string {
	Assert::parameterType(
		Document::class . '|' . DocumentFragment::class,
		$node, '$node' );

	// For a full document only the body is serialized.
	if ( $node instanceof Document ) {
		$node = DOMCompat::getBody( $node );
	}

	$this->logType = $selserMode ? 'trace/selser' : 'trace/wts';

	$state = $this->state;
	$state->initMode( $selserMode );

	$domNormalizer = new DOMNormalizer( $state );
	$domNormalizer->normalize( $node );

	if ( $this->env->hasDumpFlag( 'dom:post-normal' ) ) {
		$options = [ 'storeDiffMark' => true ];
		$this->env->writeDump( ContentUtils::dumpDOM( $node, 'DOM: post-normal', $options ) );
	}

	$state->kickOffSerialize( $node );

	if ( $state->hasIndentPreNowikis ) {
		// FIXME: Perhaps this can be done on a per-line basis
		// rather than do one post-pass on the entire document.
		$this->stripUnnecessaryIndentPreNowikis();
	}

	// Only pay for the split/join below when a flag says there may be
	// something to strip.
	$splitLines = $state->selserMode
		|| $state->hasQuoteNowikis
		|| $state->hasSelfClosingNowikis
		|| $state->hasHeadingEscapes;

	if ( $splitLines ) {
		// FIXME: Perhaps this stripping can be done as each line is
		// produced rather than as one post-pass over the entire document.
		$state->out = implode( "\n", array_map( function ( $line ) {
			$line = $this->stripUnnecessaryQuoteNowikis( $line );

			return $this->stripUnnecessaryHeadingNowikis( $line );
		}, explode( "\n", $state->out ) ) );
	}

	if ( $state->redirectText && $state->redirectText !== 'unbuffered' ) {
		// A limit of 2 is needed to isolate the first line: with a limit
		// of 1, explode() returns the whole string as its only element.
		$firstLine = explode( "\n", $state->out, 2 )[0];
		// Separate the redirect text from the output with a newline unless
		// the first line is empty or already begins with whitespace.
		$nl = preg_match( '/^(\s|$)/D', $firstLine ) ? '' : "\n";
		$state->out = $state->redirectText . $nl . $state->out;
	}

	return $state->out;
}
1630 | |
/**
 * Forward a trace message to the environment logger, using the log type
 * currently stored on this serializer.
 *
 * @note Porting note: this replaces the pattern $serializer->env->log( $serializer->logType, ... )
 * @param mixed ...$args Passed through unchanged after the log type.
 */
public function trace( ...$args ) {
	$channel = $this->logType;
	$this->env->log( $channel, ...$args );
}
1638 | |
1639 | } |