Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
0.00% |
0 / 674 |
|
0.00% |
0 / 31 |
CRAP | |
0.00% |
0 / 1 |
| WikitextSerializer | |
0.00% |
0 / 674 |
|
0.00% |
0 / 31 |
73712 | |
0.00% |
0 / 1 |
| __construct | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
| linkHandler | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| languageVariantHandler | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| escapeWikitext | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| domToWikitext | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
| htmlToWikitext | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
| getAttributeKey | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
20 | |||
| getAttributeValue | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
30 | |||
| getAttributeValueAsShadowInfo | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
6 | |||
| serializedImageAttrVal | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
| serializedAttrVal | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| tagNeedsEscaping | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| wrapAngleBracket | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
20 | |||
| serializeHTMLTag | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
72 | |||
| serializeHTMLEndTag | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
42 | |||
| serializeAttributes | |
0.00% |
0 / 56 |
|
0.00% |
0 / 1 |
992 | |||
| formatStringSubst | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
20 | |||
| createParamComparator | |
0.00% |
0 / 56 |
|
0.00% |
0 / 1 |
342 | |||
| serializePart | |
0.00% |
0 / 121 |
|
0.00% |
0 / 1 |
1892 | |||
| serializeFromParts | |
0.00% |
0 / 33 |
|
0.00% |
0 / 1 |
132 | |||
| serializeExtensionStartTag | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
30 | |||
| defaultExtensionHandler | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
20 | |||
| serializeText | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
42 | |||
| serializeTextNode | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
| emitWikitext | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| serializeNodeInternal | |
0.00% |
0 / 63 |
|
0.00% |
0 / 1 |
870 | |||
| serializeNode | |
0.00% |
0 / 62 |
|
0.00% |
0 / 1 |
306 | |||
| stripUnnecessaryHeadingNowikis | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
20 | |||
| stripUnnecessaryIndentPreNowikis | |
0.00% |
0 / 32 |
|
0.00% |
0 / 1 |
72 | |||
| stripUnnecessaryQuoteNowikis | |
0.00% |
0 / 63 |
|
0.00% |
0 / 1 |
1482 | |||
| serializeDOM | |
0.00% |
0 / 30 |
|
0.00% |
0 / 1 |
156 | |||
| 1 | <?php |
| 2 | declare( strict_types = 1 ); |
| 3 | |
| 4 | namespace Wikimedia\Parsoid\Html2Wt; |
| 5 | |
| 6 | use Closure; |
| 7 | use Exception; |
| 8 | use Wikimedia\Assert\Assert; |
| 9 | use Wikimedia\Parsoid\Config\Env; |
| 10 | use Wikimedia\Parsoid\Core\InternalException; |
| 11 | use Wikimedia\Parsoid\DOM\Comment; |
| 12 | use Wikimedia\Parsoid\DOM\Document; |
| 13 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
| 14 | use Wikimedia\Parsoid\DOM\Element; |
| 15 | use Wikimedia\Parsoid\DOM\Node; |
| 16 | use Wikimedia\Parsoid\DOM\Text; |
| 17 | use Wikimedia\Parsoid\Html2Wt\ConstrainedText\ConstrainedText; |
| 18 | use Wikimedia\Parsoid\Html2Wt\DOMHandlers\DOMHandler; |
| 19 | use Wikimedia\Parsoid\Html2Wt\DOMHandlers\DOMHandlerFactory; |
| 20 | use Wikimedia\Parsoid\NodeData\ParamInfo; |
| 21 | use Wikimedia\Parsoid\NodeData\TemplateInfo; |
| 22 | use Wikimedia\Parsoid\Tokens\KV; |
| 23 | use Wikimedia\Parsoid\Tokens\TagTk; |
| 24 | use Wikimedia\Parsoid\Tokens\Token; |
| 25 | use Wikimedia\Parsoid\Utils\ContentUtils; |
| 26 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
| 27 | use Wikimedia\Parsoid\Utils\DOMCompat; |
| 28 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
| 29 | use Wikimedia\Parsoid\Utils\DOMUtils; |
| 30 | use Wikimedia\Parsoid\Utils\PHPUtils; |
| 31 | use Wikimedia\Parsoid\Utils\Title; |
| 32 | use Wikimedia\Parsoid\Utils\TokenUtils; |
| 33 | use Wikimedia\Parsoid\Utils\Utils; |
| 34 | use Wikimedia\Parsoid\Utils\WTUtils; |
| 35 | use Wikimedia\Parsoid\Wikitext\Consts; |
| 36 | |
| 37 | /** |
| 38 | * HTML to wikitext serializer. |
| 39 | * Serializes a chunk of tokens or an HTML DOM to MediaWiki's wikitext flavor. |
| 40 | * |
| 41 | * This serializer is designed to eventually |
| 42 | * - accept arbitrary HTML and |
| 43 | * - serialize that to wikitext in a way that round-trips back to the same |
| 44 | * HTML DOM as far as possible within the limitations of wikitext. |
| 45 | * |
| 46 | * Not much effort has been invested so far on supporting |
| 47 | * non-Parsoid/VE-generated HTML. Some of this involves adaptively switching |
| 48 | * between wikitext and HTML representations based on the values of attributes |
| 49 | * and DOM context. A few special cases are already handled adaptively |
| 50 | * (multi-paragraph list item contents are serialized as HTML tags for |
| 51 | * example, generic A elements are serialized to HTML A tags), but in general |
| 52 | * support for this is mostly missing. |
| 53 | * |
| 54 | * Example issue: |
| 55 | * ``` |
| 56 | * <h1><p>foo</p></h1> will serialize to =\nfoo\n= whereas the |
| 57 | * correct serialized output would be: =<p>foo</p>= |
| 58 | * ``` |
| 59 | * |
| 60 | * What to do about this? |
| 61 | * - add a generic 'can this HTML node be serialized to wikitext in this |
| 62 | * context' detection method and use that to adaptively switch between |
| 63 | * wikitext and HTML serialization. |
| 64 | * |
| 65 | */ |
| 66 | class WikitextSerializer { |
| 67 | |
| 68 | /** @var array<string,true> Attribute names skipped by serializeAttributes() (used as a set) */ |
| 69 | private const IGNORED_ATTRIBUTES = [ |
| 70 | 'data-parsoid' => true, |
| 71 | 'data-ve-changed' => true, |
| 72 | 'data-parsoid-changed' => true, |
| 73 | 'data-parsoid-diff' => true, |
| 74 | 'data-parsoid-serialize' => true, |
| 75 | DOMDataUtils::DATA_OBJECT_ATTR_NAME => true, |
| 76 | ]; |
| 77 | |
| 78 | /** @var string[] attribute name => value regexp */ |
| 79 | private const PARSOID_ATTRIBUTES = [ |
| 80 | 'about' => '/^#mwt\d+$/D', |
| 81 | 'typeof' => '/(^|\s)mw:\S+/', |
| 82 | ]; |
| 83 | |
| 84 | /** @var string Regexp */ |
| 85 | private const TRAILING_COMMENT_OR_WS_AFTER_NL_REGEXP |
| 86 | = '/\n(\s|' . Utils::COMMENT_REGEXP_FRAGMENT . ')*$/D'; |
| 87 | |
| 88 | /** @var string Regexp */ |
| 89 | private const FORMATSTRING_REGEXP = |
| 90 | '/^(\n)?(\{\{ *_+)(\n? *\|\n? *_+ *= *)(_+)(\n? *\}\})(\n)?$/D'; |
| 91 | |
| 92 | /** @var string Regexp for testing whether nowiki added around heading-like wikitext is needed */ |
| 93 | private const HEADING_NOWIKI_REGEXP = '/^(?:' . Utils::COMMENT_REGEXP_FRAGMENT . ')*' |
| 94 | . '<nowiki>(=+[^=]+=+)<\/nowiki>(.+)$/D'; |
| 95 | |
| 96 | /** @var string[] Separator regexps, keyed by name */ |
| 97 | private static $separatorREs = [ |
| 98 | 'pureSepRE' => '/^[ \t\r\n]*$/D', |
| 99 | 'sepPrefixWithNlsRE' => '/^[ \t]*\n+[ \t\r\n]*/', |
| 100 | 'sepSuffixWithNlsRE' => '/\n[ \t\r\n]*$/D', |
| 101 | ]; |
| 102 | |
| 103 | /** @var WikitextEscapeHandlers */ |
| 104 | public $wteHandlers; |
| 105 | |
| 106 | /** @var Env */ |
| 107 | public $env; |
| 108 | |
| 109 | /** @var SerializerState */ |
| 110 | private $state; |
| 111 | |
| 112 | /** @var string Trace type for Env::trace() */ |
| 113 | public $logType; |
| 114 | |
/**
 * Set up a serializer bound to an environment.
 *
 * Besides storing $env and the trace type, this creates the SerializerState
 * (which is handed this serializer and the raw $options) and the
 * WikitextEscapeHandlers that escapeWikitext() delegates to.
 *
 * @param Env $env
 * @param array $options List of options for serialization:
 *   - logType: (string) Trace type; 'wts' is used when not set.
 *   - extName: (string) Passed to WikitextEscapeHandlers; null when not set.
 */
public function __construct( Env $env, $options ) {
	$this->env = $env;
	$this->logType = $options['logType'] ?? 'wts';
	// The state object receives the complete option array, not only the keys documented above.
	$this->state = new SerializerState( $this, $options );
	$this->wteHandlers = new WikitextEscapeHandlers( $env, $options['extName'] ?? null );
}
| 127 | |
/**
 * Main link handler.
 *
 * Used by more than one tag handler (<a> and <link>), which is why it is
 * exposed as a top-level method; the work itself is delegated to
 * LinkHandlerUtils together with this serializer's state.
 *
 * @param Element $node
 */
public function linkHandler( Element $node ): void {
	$state = $this->state;
	LinkHandlerUtils::linkHandler( $state, $node );
}
| 136 | |
/**
 * Serialize a language-variant node by delegating to LanguageVariantHandler
 * with this serializer's state.
 *
 * @param Node $node NOTE(review): the type hint accepts any Node although this
 *   parameter used to be documented as Element -- confirm what the handler requires.
 */
public function languageVariantHandler( Node $node ): void {
	$state = $this->state;
	LanguageVariantHandler::handleLanguageVariant( $state, $node );
}
| 143 | |
/**
 * Escape wikitext-like constructs in $text so that it renders as a plain
 * string once converted to HTML. How the text is escaped depends on the
 * context it appears in (for example start-of-line, inside a link, ...).
 *
 * The work is delegated to the WikitextEscapeHandlers instance created in
 * the constructor.
 *
 * @param SerializerState $state
 * @param string $text
 * @param array $opts
 *   - node: (Node)
 *   - isLastChild: (bool)
 * @return string
 */
public function escapeWikitext( SerializerState $state, string $text, array $opts ): string {
	$escaped = $this->wteHandlers->escapeWikitext( $state, $text, $opts );
	return $escaped;
}
| 159 | |
/**
 * Serialize a DOM fragment to wikitext using a fresh serializer.
 *
 * A new WikitextSerializer (and therefore a new SerializerState) is created
 * for the same Env. Any 'logType' in $opts is overwritten with this
 * serializer's trace type.
 *
 * @param array $opts Options handed to the nested serializer's constructor.
 * @param DocumentFragment $node
 * @return string
 */
public function domToWikitext(
	array $opts, DocumentFragment $node
): string {
	$nestedOpts = $opts;
	$nestedOpts['logType'] = $this->logType;
	return ( new WikitextSerializer( $this->env, $nestedOpts ) )->serializeDOM( $node );
}
| 167 | |
/**
 * Serialize an HTML string to wikitext.
 *
 * The HTML is loaded into a document fragment owned by the environment's
 * top-level document (with the 'markNew' option set) and then passed to
 * domToWikitext().
 *
 * @param array $opts Options forwarded to domToWikitext().
 * @param string $html
 * @return string
 */
public function htmlToWikitext( array $opts, string $html ): string {
	return $this->domToWikitext(
		$opts,
		ContentUtils::createAndLoadDocumentFragment(
			$this->env->getTopLevelDoc(), $html, [ 'markNew' => true ]
		)
	);
}
| 174 | |
/**
 * Get the wikitext to emit for an attribute name.
 *
 * Looks through the attribs recorded in the node's data-mw. When an entry's
 * key text equals $key and the key carries generated HTML, that HTML is
 * serialized back to wikitext and returned. Otherwise $key is returned as is.
 *
 * @param Element $node
 * @param string $key Attribute name.
 * @return string
 */
public function getAttributeKey( Element $node, string $key ): string {
	$wtsOpts = [
		'env' => $this->env,
		'onSOL' => false,
	];
	foreach ( DOMDataUtils::getDataMw( $node )->attribs ?? [] as $attrib ) {
		$isRequestedKey = ( $attrib->key['txt'] ?? null ) === $key;
		if ( !$isRequestedKey || !isset( $attrib->key['html'] ) ) {
			continue;
		}
		// The key is generated content: serialize its HTML back to
		// the wikitext that generated it.
		return $this->htmlToWikitext( $wtsOpts, $attrib->key['html'] );
	}
	return $key;
}
| 189 | |
/**
 * Get the wikitext for an attribute value that was generated content.
 *
 * Only data-mw attribs whose value has an 'html' form are considered; a
 * value that is only available in text form is not returned from here.
 *
 * @param Element $node
 * @param string $key Attribute name.
 * @return ?string The wikitext value, or null if the attribute is not present.
 */
public function getAttributeValue( Element $node, string $key ): ?string {
	$wtsOpts = [
		'env' => $this->env,
		'onSOL' => false,
		'inAttribute' => true,
	];
	foreach ( DOMDataUtils::getDataMw( $node )->attribs ?? [] as $attrib ) {
		// PORT-FIXME: not type safe. Need documentation on attrib format.
		$keyMatches = $attrib->key === $key || ( $attrib->key['txt'] ?? null ) === $key;

		// An empty html value (html:"") is deliberately accepted: it
		// serializes to "" and is returned, which suppresses the =".."
		// part of the attribute when the template itself generated a
		// complete "k=v" string.
		// Ex: <div {{1x|1=style='color:red'}}>foo</div>
		if ( $keyMatches && isset( $attrib->value['html'] ) ) {
			return $this->htmlToWikitext( $wtsOpts, $attrib->value['html'] );
		}
	}
	return null;
}
| 220 | |
/**
 * Wrap the result of getAttributeValue() in a shadow-info tuple.
 *
 * @param Element $node
 * @param string $key
 * @return array|null A tuple in {@link WTSUtils::getShadowInfo()} format
 *   with an extra 'fromDataMW' flag, or null when getAttributeValue()
 *   has no value for $key.
 */
public function getAttributeValueAsShadowInfo( Element $node, string $key ): ?array {
	$wikitext = $this->getAttributeValue( $node, $key );
	return $wikitext === null
		? null
		: [
			'value' => $wikitext,
			'modified' => false,
			'fromsrc' => true,
			'fromDataMW' => true,
		];
}
| 239 | |
/**
 * Get shadow info for an attribute, preferring a generated value recorded
 * in data-mw of one node and falling back to the attribute shadow info of
 * another node.
 *
 * @param Element $dataMWnode Node whose data-mw is consulted first.
 * @param Element $htmlAttrNode Node used for the fallback lookup.
 * @param string $key
 * @return array A tuple in {@link WTSUtils::getShadowInfo()} format,
 *   possibly with an extra 'fromDataMW' flag.
 */
public function serializedImageAttrVal(
	Element $dataMWnode, Element $htmlAttrNode, string $key
): array {
	$fromDataMw = $this->getAttributeValueAsShadowInfo( $dataMWnode, $key );
	if ( $fromDataMw ) {
		return $fromDataMw;
	}
	return WTSUtils::getAttributeShadowInfo( $htmlAttrNode, $key );
}
| 253 | |
/**
 * Get shadow info for an attribute when the same node supplies both the
 * data-mw and the HTML attribute; see serializedImageAttrVal().
 *
 * @param Element $node
 * @param string $name Attribute name.
 * @return array
 */
public function serializedAttrVal( Element $node, string $name ): array {
	$shadowInfo = $this->serializedImageAttrVal( $node, $node, $name );
	return $shadowInfo;
}
| 257 | |
/**
 * Check whether a tag with the given name needs escaping, i.e. whether
 * WTUtils::isAnnOrExtTag() reports true for it in this environment.
 *
 * @param string $name Tag name.
 * @return bool
 */
public function tagNeedsEscaping( string $name ): bool {
	$env = $this->env;
	return WTUtils::isAnnOrExtTag( $env, $name );
}
| 267 | |
/**
 * Wrap the inside of a tag in angle brackets, entity-escaping the brackets
 * when the tag name must not be emitted literally.
 *
 * @param Token $token Token the tag is being emitted for.
 * @param string $inner Text to place between the brackets (tag name,
 *   attributes, and any leading or trailing slash).
 * @return string "<$inner>", or "&lt;$inner&gt;" when the tag is escaped.
 */
public function wrapAngleBracket( Token $token, string $inner ): string {
	if (
		$this->tagNeedsEscaping( $token->getName() ) &&
		!(
			// Allow for html tags that shadow extension tags found in source
			// to roundtrip. They only parse as html tags if they are unclosed,
			// since extension tags bail on parsing without closing tags.
			//
			// This only applies when wrapAngleBracket() is being called for
			// start tags, but we wouldn't be here if it was autoInsertedEnd
			// anyways.
			isset( Consts::$Sanitizer['AllowedLiteralTags'][$token->getName()] ) &&
			!empty( $token->dataParsoid->autoInsertedEnd )
		)
	) {
		// Emit the brackets as entities so the result stays literal text.
		// Returning raw brackets here would make this branch identical to
		// the fall-through below and the check above pointless.
		return "&lt;{$inner}&gt;";
	}
	return "<$inner>";
}
| 287 | |
/**
 * Serialize the start tag of an element in HTML syntax.
 *
 * @param Element $node
 * @param bool $wrapperUnmodified When true, the original source of the
 *   opening tag (taken from the node's dsr) is returned instead of a
 *   freshly built tag.
 * @return string The start tag, or '' for an auto-inserted start tag.
 */
public function serializeHTMLTag( Element $node, bool $wrapperUnmodified ): string {
	$startTk = WTSUtils::mkTagTk( $node );

	// TODO(arlolra): As of 1.3.0, html pre is considered an extension
	// and wrapped in encapsulation. When that version is no longer
	// accepted for serialization, we can remove this backwards
	// compatibility code.
	//
	// The 'inHTMLPre' flag must always be updated, even when we are
	// selsering in the wrapperUnmodified case, hence it comes first.
	if ( $startTk->getName() === 'pre' ) {
		// html-syntax pre is very similar to nowiki
		$this->state->inHTMLPre = true;
	}

	if ( $wrapperUnmodified ) {
		$openRange = DOMDataUtils::getDataParsoid( $node )->dsr->openRange();
		return $this->state->getOrigSrc( $openRange ) ?? '';
	}

	$dp = $startTk->dataParsoid;
	if ( !empty( $dp->autoInsertedStart ) ) {
		return '';
	}

	// Void elements (unless flagged noClose) and tags marked selfClose
	// are written with a trailing " /".
	$isSelfClosed = ( Utils::isVoidElement( $startTk->getName() ) && empty( $dp->noClose ) )
		|| !empty( $dp->selfClose );
	$close = $isSelfClosed ? ' /' : '';

	$attrText = $this->serializeAttributes( $node, $startTk );
	$attrPart = $attrText === '' ? '' : ' ' . $attrText;

	// srcTagName cannot be '' so, it is okay to use ?? operator
	$srcName = $dp->srcTagName ?? $startTk->getName();

	return $this->wrapAngleBracket( $startTk, $srcName . $attrPart . $close );
}
| 329 | |
/**
 * Serialize the end tag of an element in HTML syntax.
 *
 * @param Element $node
 * @param bool $wrapperUnmodified When true, the original source of the
 *   closing tag (taken from the node's dsr) is returned instead of a
 *   freshly built tag.
 * @return string The end tag, or '' when none should be emitted
 *   (auto-inserted end, void element, or self-closed tag).
 */
public function serializeHTMLEndTag( Element $node, $wrapperUnmodified ): string {
	if ( $wrapperUnmodified ) {
		$closeRange = DOMDataUtils::getDataParsoid( $node )->dsr->closeRange();
		return $this->state->getOrigSrc( $closeRange ) ?? '';
	}

	$endTk = WTSUtils::mkEndTagTk( $node );
	if ( $endTk->getName() === 'pre' ) {
		// Leaving the html-syntax pre that serializeHTMLTag() flagged.
		$this->state->inHTMLPre = false;
	}

	// srcTagName cannot be '' so, it is okay to use ?? operator
	$srcName = $endTk->dataParsoid->srcTagName ?? $endTk->getName();

	$noEndTag = !empty( $endTk->dataParsoid->autoInsertedEnd )
		|| Utils::isVoidElement( $endTk->getName() )
		|| !empty( $endTk->dataParsoid->selfClose );
	if ( $noEndTag ) {
		return '';
	}

	return $this->wrapAngleBracket( $endTk, "/{$srcName}" );
}
| 359 | |
/**
 * Serialize the attributes of a token into the text that goes inside a tag.
 *
 * Attributes are emitted as `key="value"` pairs joined by single spaces.
 * Double quotes inside values are written as `&quot;` so that a value can
 * never terminate the attribute it is emitted in.
 *
 * Parsoid-internal attributes (IGNORED_ATTRIBUTES, data-mw), Parsoid-like
 * ids, auto-generated heading ids, the mw-empty-elt class and Parsoid
 * generated about/typeof values are dropped or stripped. Attributes that
 * were removed by sanitization (recorded in data-parsoid 'a'/'sa') are
 * added back from their source form.
 *
 * @param Element $node Node the token was created from.
 * @param Token $token Token whose attribs are serialized.
 * @param bool $isWt When true, values that did not come from source are
 *   not passed through Utils::escapeWtEntities().
 * @return string
 */
public function serializeAttributes( Element $node, Token $token, bool $isWt = false ): string {
	$attribs = $token->attribs;

	$out = [];
	foreach ( $attribs as $kv ) {
		// Tokens created during html2wt don't have nested tokens for keys.
		// But, they could be integers but we want strings below.
		$k = (string)$kv->k;
		$v = null;
		$vInfo = null;

		// Unconditionally ignore
		// (all of the IGNORED_ATTRIBUTES should be filtered out earlier,
		// but ignore them here too just to make sure.)
		if ( isset( self::IGNORED_ATTRIBUTES[$k] ) || $k === 'data-mw' ) {
			continue;
		}

		// Ignore parsoid-like ids. They may have been left behind
		// by clients and shouldn't be serialized. This can also happen
		// in v2/v3 API when there is no matching data-parsoid entry found
		// for this id.
		if ( $k === 'id' && preg_match( '/^mw[\w-]{2,}$/D', $kv->v ) ) {
			if ( WTUtils::isNewElt( $node ) ) {
				// Parsoid id found on element without a matching data-parsoid. Drop it!
			} else {
				$vInfo = $token->getAttributeShadowInfo( $k );
				if ( !$vInfo['modified'] && $vInfo['fromsrc'] ) {
					$out[] = $k . '=' . '"' . str_replace( '"', '&quot;', $vInfo['value'] ) . '"';
				}
			}
			continue;
		}

		// Parsoid auto-generates ids for headings and they should
		// be stripped out, except if this is not auto-generated id.
		if ( $k === 'id' && DOMUtils::isHeading( $node ) ) {
			if ( !empty( DOMDataUtils::getDataParsoid( $node )->reusedId ) ) {
				$vInfo = $token->getAttributeShadowInfo( $k );
				// PORT-FIXME: is this safe? value could be a token or token array
				$out[] = $k . '="' . str_replace( '"', '&quot;', $vInfo['value'] ) . '"';
			}
			continue;
		}

		// Strip Parsoid-inserted class="mw-empty-elt" attributes
		if ( $k === 'class'
			&& isset( Consts::$Output['FlaggedEmptyElts'][DOMCompat::nodeName( $node )] )
		) {
			$kv->v = preg_replace( '/\bmw-empty-elt\b/', '', $kv->v, 1 );
			if ( !$kv->v ) {
				continue;
			}
		}

		// Strip other Parsoid-generated values
		//
		// FIXME: Given that we are currently escaping about/typeof keys
		// that show up in wikitext, we could unconditionally strip these
		// away right now.
		$parsoidValueRegExp = self::PARSOID_ATTRIBUTES[$k] ?? null;
		if ( $parsoidValueRegExp && preg_match( $parsoidValueRegExp, $kv->v ) ) {
			$v = preg_replace( $parsoidValueRegExp, '', $kv->v );
			if ( $v ) {
				$out[] = $k . '="' . $v . '"';
			}
			continue;
		}

		if ( strlen( $k ) > 0 ) {
			$vInfo = $token->getAttributeShadowInfo( $k );
			$v = $vInfo['value'];
			// Deal with k/v's that were template-generated
			$kk = $this->getAttributeKey( $node, $k );
			// Pass in $k, not $kk since $kk can potentially
			// be original wikitext source for 'k' rather than
			// the string value of the key.
			$vv = $this->getAttributeValue( $node, $k ) ?? $v;
			// Remove encapsulation from protected attributes
			// in pegTokenizer.pegjs:generic_newline_attribute
			$kk = preg_replace( '/^data-x-/i', '', $kk, 1 );
			// PORT-FIXME: is this type safe? $vv could be a ConstrainedText
			if ( $vv !== null && strlen( $vv ) > 0 ) {
				if ( !$vInfo['fromsrc'] && !$isWt ) {
					// Escape wikitext entities, and '>' as well so the
					// value cannot be read as the end of the tag.
					$vv = str_replace( '>', '&gt;', Utils::escapeWtEntities( $vv ) );
				}
				$out[] = $kk . '="' . str_replace( '"', '&quot;', $vv ) . '"';
			} elseif ( preg_match( '/[{<]/', $kk ) ) {
				// Templated, <*include*>, or <ext-tag> generated
				$out[] = $kk;
			} else {
				$out[] = $kk . '=""';
			}
			continue;
			// PORT-FIXME: is this type safe? $k->v could be a Token or Token array
		} elseif ( strlen( $kv->v ) ) {
			// not very likely..
			$out[] = $kv->v;
		}
	}

	// SSS FIXME: It can be reasonably argued that we can permanently delete
	// dangerous and unacceptable attributes in the interest of safety/security
	// and the resultant dirty diffs should be acceptable. But, this is
	// something to do in the future once we have passed the initial tests
	// of parsoid acceptance.
	//
	// 'a' data attribs -- look for attributes that were removed
	// as part of sanitization and add them back
	$dataParsoid = $token->dataParsoid;
	if ( isset( $dataParsoid->a ) && isset( $dataParsoid->sa ) ) {
		$aKeys = array_keys( $dataParsoid->a );
		foreach ( $aKeys as $k ) {
			// Attrib not present -- sanitized away!
			if ( !KV::lookupKV( $attribs, (string)$k ) ) {
				$v = $dataParsoid->sa[$k] ?? null;
				// FIXME: The tokenizer and attribute shadowing currently
				// don't make much effort towards distinguishing the use
				// of HTML empty attribute syntax. We can derive whether
				// empty attribute syntax was used from the attributes
				// srcOffsets in the Sanitizer, from the key end position
				// and value start position being different.
				if ( $v !== null && $v !== '' ) {
					$out[] = $k . '="' . str_replace( '"', '&quot;', $v ) . '"';
				} else {
					$out[] = $k;
				}
			}
		}
	}
	// XXX: round-trip optional whitespace / line breaks etc
	return implode( ' ', $out );
}
| 494 | |
/**
 * Substitute $value for the first run of underscores ("hole") in $format.
 *
 * When the value is shorter than the hole, it is right-padded with spaces
 * up to the hole's length; an empty value simply removes the hole.
 *
 * @param string $format Format fragment containing a `_+` placeholder.
 * @param string $value Text to put in place of the placeholder.
 * @param bool $forceTrim Whether to trim() $value before substituting.
 * @return string
 */
private function formatStringSubst( string $format, string $value, bool $forceTrim ): string {
	// PORT-FIXME: JS is more agressive and removes various unicode whitespaces
	// (most notably nbsp). Does that matter?
	$filler = $forceTrim ? trim( $value ) : $value;
	$fillerLen = mb_strlen( $filler );

	return preg_replace_callback(
		'/_+/',
		static function ( $match ) use ( $filler, $fillerLen ) {
			if ( $filler === '' ) {
				return '';
			}
			// Hole width is counted in bytes, value width in characters.
			$padding = strlen( $match[0] ) - $fillerLen;
			return $padding > 0 ? $filler . str_repeat( ' ', $padding ) : $filler;
		},
		$format,
		1
	);
}
| 511 | |
/**
 * Generates a template parameter sort function that tries to preserve existing ordering
 * but also to follow the order prescribed by the templatedata.
 *
 * The returned comparator takes two parameter keys and returns a negative,
 * zero or positive integer. It compares, in this order:
 *  1. the position of the nearest parameter from the original wikitext,
 *  2. the templatedata order (for aliases of the same parameter, the alias index),
 *  3. the position of the key in $dataMwKeys.
 *
 * @param array $dpArgInfo Per-parameter info from data-parsoid; each entry's
 *   `k` property is read as the parameter key.
 * @param ?array $tplData Templatedata; only 'paramOrder' and
 *   'params'[key]['aliases'] are read here.
 * @param array $dataMwKeys Parameter keys in their new data-mw order.
 * @return Closure
 */
private function createParamComparator(
	array $dpArgInfo, ?array $tplData, array $dataMwKeys
): Closure {
	// Record order of parameters in new data-mw
	$newOrder = [];
	foreach ( $dataMwKeys as $i => $key ) {
		$newOrder[$key] = [ 'order' => $i ];
	}
	// Record order of parameters in templatedata (if present)
	$tplDataOrder = [];
	// Maps a parameter name (canonical or alias) to its canonical key;
	// 'order' is -1 for the canonical name and the alias index otherwise.
	$aliasMap = [];
	// Canonical keys in templatedata order.
	$keys = [];
	if ( $tplData && isset( $tplData['paramOrder'] ) ) {
		foreach ( $tplData['paramOrder'] as $i => $key ) {
			$tplDataOrder[$key] = [ 'order' => $i ];
			$aliasMap[$key] = [ 'key' => $key, 'order' => -1 ];
			$keys[] = $key;
			// Aliases have the same sort order as the main name.
			$aliases = $tplData['params'][$key]['aliases'] ?? [];
			foreach ( $aliases as $j => $alias ) {
				$aliasMap[$alias] = [ 'key' => $key, 'order' => $j ];
			}
		}
	}
	// Record order of parameters in original wikitext (from data-parsoid)
	$origOrder = [];
	foreach ( $dpArgInfo as $i => $argInfo ) {
		$origOrder[$argInfo->k] = [ 'order' => $i, 'dist' => 0 ];
	}
	// Canonical parameter key gets the same order as an alias parameter
	// found in the original wikitext.
	foreach ( $dpArgInfo as $i => $argInfo ) {
		$canon = $aliasMap[$argInfo->k] ?? null;
		if ( $canon !== null && !array_key_exists( $canon['key'], $origOrder ) ) {
			$origOrder[$canon['key']] = $origOrder[$argInfo->k];
		}
	}
	// Find the closest "original parameter" for each templatedata parameter,
	// so that newly-added parameters are placed near the parameters which
	// templatedata says they should be adjacent to.
	$nearestOrder = $origOrder;
	// Reducer run over the templatedata keys. The accumulator holds the
	// order of the last original parameter seen and the number of steps
	// taken since then ('dist'); $nearestOrder is updated as a side effect.
	$reduceF = static function ( $acc, $val ) use ( &$origOrder, &$nearestOrder ) {
		if ( isset( $origOrder[$val] ) ) {
			$acc = $origOrder[$val];
		}
		// Keep an existing entry only if it is strictly closer than $acc.
		if ( !( isset( $nearestOrder[$val] ) && $nearestOrder[$val]['dist'] < $acc['dist'] ) ) {
			$nearestOrder[$val] = $acc;
		}
		return [ 'order' => $acc['order'], 'dist' => $acc['dist'] + 1 ];
	};
	// Find closest original parameter before the key.
	// @phan-suppress-next-line PhanPluginUseReturnValueInternalKnown
	array_reduce( $keys, $reduceF, [ 'order' => -1, 'dist' => 2 * count( $keys ) ] );
	// Find closest original parameter after the key.
	// @phan-suppress-next-line PhanPluginUseReturnValueInternalKnown
	array_reduce( array_reverse( $keys ), $reduceF,
		[ 'order' => count( $origOrder ), 'dist' => count( $keys ) ] );

	// Helper function to return a large number if the given key isn't
	// in the sort order map
	$big = max( count( $nearestOrder ), count( $newOrder ) );
	// $key2 is only consulted when it is given and $key1 is not in $map.
	$defaultGet = static function ( $map, $key1, $key2 = null ) use ( &$big ) {
		$key = ( !$key2 || isset( $map[$key1] ) ) ? $key1 : $key2;
		return $map[$key]['order'] ?? $big;
	};

	return static function ( $a, $b ) use (
		&$aliasMap, &$defaultGet, &$nearestOrder, &$tplDataOrder, &$newOrder
	) {
		// Names unknown to templatedata are treated as their own canonical key.
		$aCanon = $aliasMap[$a] ?? [ 'key' => $a, 'order' => -1 ];
		$bCanon = $aliasMap[$b] ?? [ 'key' => $b, 'order' => -1 ];
		// primary key is `nearestOrder` (nearest original parameter)
		$aOrder = $defaultGet( $nearestOrder, $a, $aCanon['key'] );
		$bOrder = $defaultGet( $nearestOrder, $b, $bCanon['key'] );
		if ( $aOrder !== $bOrder ) {
			return $aOrder - $bOrder;
		}
		// secondary key is templatedata order
		if ( $aCanon['key'] === $bCanon['key'] ) {
			return $aCanon['order'] - $bCanon['order'];
		}
		$aOrder = $defaultGet( $tplDataOrder, $aCanon['key'] );
		$bOrder = $defaultGet( $tplDataOrder, $bCanon['key'] );
		if ( $aOrder !== $bOrder ) {
			return $aOrder - $bOrder;
		}
		// tertiary key is original input order (makes sort stable)
		$aOrder = $defaultGet( $newOrder, $a );
		$bOrder = $defaultGet( $newOrder, $b );
		return $aOrder - $bOrder;
	};
}
| 612 | |
| 613 | /** |
| 614 | * Serialize part of a templatelike expression. |
| 615 | * @param SerializerState $state |
| 616 | * @param string $buf |
| 617 | * @param Element $node |
| 618 | * @param TemplateInfo $part The expression fragment to serialize. See $srcParts |
| 619 | * in serializeFromParts() for format. |
| 620 | * @param ?array $tplData Templatedata, see |
| 621 | * https://github.com/wikimedia/mediawiki-extensions-TemplateData/blob/master/Specification.md |
| 622 | * @param string|TemplateInfo $prevPart Previous part. See $srcParts in serializeFromParts(). |
| 623 | * @param string|TemplateInfo $nextPart Next part. See $srcParts in serializeFromParts(). |
| 624 | * @return string |
| 625 | */ |
| 626 | private function serializePart( |
| 627 | SerializerState $state, string $buf, Element $node, TemplateInfo $part, |
| 628 | ?array $tplData, $prevPart, $nextPart |
| 629 | ): string { |
| 630 | // Parse custom format specification, if present. |
| 631 | $defaultBlockSpc = "{{_\n| _ = _\n}}"; // "block" |
| 632 | $defaultInlineSpc = '{{_|_=_}}'; // "inline" |
| 633 | |
| 634 | $format = isset( $tplData['format'] ) ? strtolower( $tplData['format'] ) : null; |
| 635 | if ( $format === 'block' ) { |
| 636 | $format = $defaultBlockSpc; |
| 637 | } elseif ( $format === 'inline' ) { |
| 638 | $format = $defaultInlineSpc; |
| 639 | } |
| 640 | // Check format string for validity. |
| 641 | preg_match( self::FORMATSTRING_REGEXP, $format ?? '', $parsedFormat ); |
| 642 | if ( !$parsedFormat ) { |
| 643 | preg_match( self::FORMATSTRING_REGEXP, $defaultInlineSpc, $parsedFormat ); |
| 644 | $format = null; // Indicates that no valid custom format was present. |
| 645 | } |
| 646 | $formatSOL = $parsedFormat[1] ?? ''; |
| 647 | $formatStart = $parsedFormat[2] ?? ''; |
| 648 | $formatParamName = $parsedFormat[3] ?? ''; |
| 649 | $formatParamValue = $parsedFormat[4] ?? ''; |
| 650 | $formatEnd = $parsedFormat[5] ?? ''; |
| 651 | $formatEOL = $parsedFormat[6] ?? ''; |
| 652 | $forceTrim = ( $format !== null ) || WTUtils::isNewElt( $node ); |
| 653 | |
| 654 | // Shoehorn formatting of top-level templatearg wikitext into this code. |
| 655 | if ( $part->type === 'templatearg' ) { |
| 656 | $formatStart = preg_replace( '/{{/', '{{{', $formatStart, 1 ); |
| 657 | $formatEnd = preg_replace( '/}}/', '}}}', $formatEnd, 1 ); |
| 658 | } |
| 659 | |
| 660 | // handle SOL newline requirement |
| 661 | if ( $formatSOL && !str_ends_with( ( $prevPart !== null ) ? $buf : ( $state->sep->src ?? '' ), "\n" ) ) { |
| 662 | $buf .= "\n"; |
| 663 | } |
| 664 | |
| 665 | // open the transclusion |
| 666 | $buf .= $this->formatStringSubst( $formatStart, $part->targetWt, $forceTrim ); |
| 667 | |
| 668 | // Short-circuit transclusions without params |
| 669 | $paramKeys = array_map( static fn ( ParamInfo $pi ) => $pi->k, $part->paramInfos ); |
| 670 | if ( !$paramKeys ) { |
| 671 | if ( substr( $formatEnd, 0, 1 ) === "\n" ) { |
| 672 | $formatEnd = substr( $formatEnd, 1 ); |
| 673 | } |
| 674 | return $buf . $formatEnd; |
| 675 | } |
| 676 | |
| 677 | // Trim whitespace from data-mw keys to deal with non-compliant |
| 678 | // clients. Make sure param info is accessible for the stripped key |
| 679 | // since later code will be using the stripped key always. |
| 680 | $tplKeysFromDataMw = []; |
| 681 | foreach ( $part->paramInfos as $pi ) { |
| 682 | $strippedKey = trim( $pi->k ); |
| 683 | $tplKeysFromDataMw[$strippedKey] = $pi; |
| 684 | } |
| 685 | |
| 686 | // Per-parameter info from data-parsoid for pre-existing parameters |
| 687 | $dp = DOMDataUtils::getDataParsoid( $node ); |
| 688 | // Account for clients not setting the `i`, see T238721 |
| 689 | $dpArgInfo = $part->i !== null ? ( $dp->pi[$part->i] ?? [] ) : []; |
| 690 | |
| 691 | // Build a key -> arg info map |
| 692 | $dpArgInfoMap = []; |
| 693 | foreach ( $dpArgInfo as $info ) { |
| 694 | $dpArgInfoMap[$info->k] = $info; |
| 695 | } |
| 696 | |
| 697 | // 1. Process all parameters and build a map of |
| 698 | // arg-name -> [serializeAsNamed, name, value] |
| 699 | // |
| 700 | // 2. Serialize tpl args in required order |
| 701 | // |
| 702 | // 3. Format them according to formatParamName/formatParamValue |
| 703 | |
| 704 | $kvMap = []; |
| 705 | foreach ( $tplKeysFromDataMw as $key => $param ) { |
| 706 | // Storing keys in an array can turn them into ints; stringify. |
| 707 | $key = (string)$key; |
| 708 | $argInfo = $dpArgInfoMap[$key] ?? []; |
| 709 | |
| 710 | // TODO: Other formats? |
| 711 | // Only consider the html parameter if the wikitext one |
| 712 | // isn't present at all. If it's present but empty, |
| 713 | // that's still considered a valid parameter. |
| 714 | if ( $param->valueWt !== null ) { |
| 715 | $value = $param->valueWt; |
| 716 | } elseif ( $param->html !== null ) { |
| 717 | $value = $this->htmlToWikitext( [ 'env' => $this->env ], $param->html ); |
| 718 | } else { |
| 719 | $this->env->log( |
| 720 | 'error', |
| 721 | "params in data-mw part is missing wt/html for $key. " . |
| 722 | "Serializing as empty string.", |
| 723 | "data-mw part: " . json_encode( $part->toJsonArray() ) |
| 724 | ); |
| 725 | $value = ""; |
| 726 | } |
| 727 | |
| 728 | Assert::invariant( is_string( $value ), "For param: $key, wt property should be a string ' |
| 729 | . 'but got: $value" ); |
| 730 | |
| 731 | $serializeAsNamed = !empty( $argInfo->named ); |
| 732 | |
| 733 | // The name is usually equal to the parameter key, but |
| 734 | // if there's a key->wt attribute, use that. |
| 735 | $name = null; |
| 736 | if ( $param->keyWt !== null ) { |
| 737 | $name = $param->keyWt; |
| 738 | // And make it appear even if there wasn't any data-parsoid information. |
| 739 | $serializeAsNamed = true; |
| 740 | } else { |
| 741 | $name = $key; |
| 742 | } |
| 743 | |
| 744 | // Use 'k' as the key, not 'name'. |
| 745 | // |
| 746 | // The normalized form of 'k' is used as the key in both |
| 747 | // data-parsoid and data-mw. The full non-normalized form |
| 748 | // is present in '$param->keyWt' |
| 749 | $kvMap[$key] = [ 'serializeAsNamed' => $serializeAsNamed, 'name' => $name, 'value' => $value ]; |
| 750 | } |
| 751 | |
| 752 | $argOrder = array_keys( $kvMap ); |
| 753 | usort( $argOrder, $this->createParamComparator( $dpArgInfo, $tplData, $argOrder ) ); |
| 754 | |
| 755 | $argIndex = 1; |
| 756 | $numericIndex = 1; |
| 757 | |
| 758 | $numPositionalArgs = 0; |
| 759 | foreach ( $dpArgInfo as $pi ) { |
| 760 | if ( isset( $tplKeysFromDataMw[trim( $pi->k )] ) && empty( $pi->named ) ) { |
| 761 | $numPositionalArgs++; |
| 762 | } |
| 763 | } |
| 764 | |
| 765 | $argBuf = []; |
| 766 | foreach ( $argOrder as $param ) { |
| 767 | $kv = $kvMap[$param]; |
| 768 | // Add nowiki escapes for the arg value, as required |
| 769 | $escapedValue = $this->wteHandlers->escapeTplArgWT( $kv['value'], [ |
| 770 | 'serializeAsNamed' => $kv['serializeAsNamed'] || $param !== $numericIndex, |
| 771 | 'type' => $part->type, |
| 772 | 'argPositionalIndex' => $numericIndex, |
| 773 | 'numPositionalArgs' => $numPositionalArgs, |
| 774 | 'argIndex' => $argIndex++, |
| 775 | 'numArgs' => count( $tplKeysFromDataMw ), |
| 776 | ] ); |
| 777 | if ( $escapedValue['serializeAsNamed'] ) { |
| 778 | // WS trimming for values of named args |
| 779 | $argBuf[] = [ 'dpKey' => $param, 'name' => $kv['name'], 'value' => trim( $escapedValue['v'] ) ]; |
| 780 | } else { |
| 781 | $numericIndex++; |
| 782 | // No WS trimming for positional args |
| 783 | $argBuf[] = [ 'dpKey' => $param, 'name' => null, 'value' => $escapedValue['v'] ]; |
| 784 | } |
| 785 | } |
| 786 | |
| 787 | // If no explicit format is provided, default format is: |
| 788 | // - 'inline' for new args |
| 789 | // - whatever format is available from data-parsoid for old args |
| 790 | // (aka, overriding formatParamName/formatParamValue) |
| 791 | // |
| 792 | // If an unedited node OR if paramFormat is unspecified, |
| 793 | // this strategy prevents unnecessary normalization |
| 794 | // of edited transclusions which don't have valid |
| 795 | // templatedata formatting information. |
| 796 | |
| 797 | // "magic case": If the format string ends with a newline, an extra newline is added |
| 798 | // between the template name and the first parameter. |
| 799 | |
| 800 | foreach ( $argBuf as $arg ) { |
| 801 | $name = $arg['name']; |
| 802 | $val = $arg['value']; |
| 803 | if ( $name === null ) { |
| 804 | // We are serializing a positional parameter. |
| 805 | // Whitespace is significant for these and |
| 806 | // formatting would change semantics. |
| 807 | $name = ''; |
| 808 | $modFormatParamName = '|_'; |
| 809 | $modFormatParamValue = '_'; |
| 810 | } elseif ( $name === '' ) { |
| 811 | // No spacing for blank parameters ({{foo|=bar}}) |
| 812 | // This should be an edge case and probably only for |
| 813 | // inline-formatted templates, but we are consciously |
| 814 | // forcing this default here. Can revisit if this is |
| 815 | // ever a problem. |
| 816 | $modFormatParamName = '|_='; |
| 817 | $modFormatParamValue = '_'; |
| 818 | } else { |
| 819 | // Preserve existing spacing, esp if there was a comment |
| 820 | // embedded in it. Otherwise, follow TemplateData's lead. |
| 821 | // NOTE: In either case, we are forcibly normalizing |
| 822 | // non-block-formatted transclusions into block formats |
| 823 | // by adding missing newlines. |
| 824 | $spc = $dpArgInfoMap[$arg['dpKey']]->spc ?? null; |
| 825 | if ( $spc && ( !$format || preg_match( Utils::COMMENT_REGEXP, $spc[3] ?? '' ) ) ) { |
| 826 | $nl = ( substr( $formatParamName, 0, 1 ) === "\n" ) ? "\n" : ''; |
| 827 | $modFormatParamName = $nl . '|' . $spc[0] . '_' . $spc[1] . '=' . $spc[2]; |
| 828 | $modFormatParamValue = '_' . $spc[3]; |
| 829 | } else { |
| 830 | $modFormatParamName = $formatParamName; |
| 831 | $modFormatParamValue = $formatParamValue; |
| 832 | } |
| 833 | } |
| 834 | |
| 835 | // Don't create duplicate newlines. |
| 836 | $trailing = preg_match( self::TRAILING_COMMENT_OR_WS_AFTER_NL_REGEXP, $buf ); |
| 837 | if ( $trailing && substr( $formatParamName, 0, 1 ) === "\n" ) { |
| 838 | $modFormatParamName = substr( $formatParamName, 1 ); |
| 839 | } |
| 840 | |
| 841 | $buf .= $this->formatStringSubst( $modFormatParamName, $name, $forceTrim ); |
| 842 | $buf .= $this->formatStringSubst( $modFormatParamValue, $val, $forceTrim ); |
| 843 | } |
| 844 | |
| 845 | // Don't create duplicate newlines. |
| 846 | if ( preg_match( self::TRAILING_COMMENT_OR_WS_AFTER_NL_REGEXP, $buf ) |
| 847 | && substr( $formatEnd, 0, 1 ) === "\n" |
| 848 | ) { |
| 849 | $buf .= substr( $formatEnd, 1 ); |
| 850 | } else { |
| 851 | $buf .= $formatEnd; |
| 852 | } |
| 853 | |
| 854 | if ( $formatEOL ) { |
| 855 | if ( $nextPart === null ) { |
| 856 | // This is the last part of the block. Add the \n only |
| 857 | // if the next non-comment node is not a text node |
| 858 | // or if the text node doesn't have a leading \n. |
| 859 | $next = DiffDOMUtils::nextNonDeletedSibling( $node ); |
| 860 | while ( $next instanceof Comment ) { |
| 861 | $next = DiffDOMUtils::nextNonDeletedSibling( $next ); |
| 862 | } |
| 863 | if ( !( $next instanceof Text ) || substr( $next->nodeValue, 0, 1 ) !== "\n" ) { |
| 864 | $buf .= "\n"; |
| 865 | } |
| 866 | } elseif ( !is_string( $nextPart ) || substr( $nextPart, 0, 1 ) !== "\n" ) { |
| 867 | // If nextPart is another template, and it wants a leading nl, |
| 868 | // this \n we add here will count towards that because of the |
| 869 | // formatSOL check at the top. |
| 870 | $buf .= "\n"; |
| 871 | } |
| 872 | } |
| 873 | |
| 874 | return $buf; |
| 875 | } |
| 876 | |
/**
 * Serialize a transclusion from its data-mw parts.
 *
 * String parts are appended to the output verbatim. Every other part is
 * handed to serializePart(); a part whose targetWt is null is logged as
 * malformed and dropped.
 *
 * @param SerializerState $state
 * @param Element $node
 * @param list<string|TemplateInfo> $srcParts Template parts
 * @return string
 */
public function serializeFromParts(
	SerializerState $state, Element $node, array $srcParts
): string {
	// TemplateData is only looked up for new elements or nodes with diff markers.
	$useTplData = WTUtils::isNewElt( $node ) || DiffUtils::hasDiffMarkers( $node );
	$buf = '';
	foreach ( $srcParts as $i => $part ) {
		if ( is_string( $part ) ) {
			$buf .= $part;
			continue;
		}

		if ( $part->targetWt === null ) {
			// Maybe we should just raise a ClientError
			$this->env->log( 'error', 'data-mw.parts array is malformed: ',
				DOMCompat::getOuterHTML( $node ), PHPUtils::jsonEncode( $srcParts ) );
			continue;
		}

		// Account for clients leaving off the params array, presumably when empty.
		// See T291741
		$part->paramInfos ??= [];

		// transclusion: tpl or parser function?
		// templates have $part->href
		// parser functions have $part->func
		//
		// While the API supports fetching multiple template data objects in one call,
		// we will fetch one at a time to benefit from cached responses.
		//
		// Template args never get template data; neither do parser functions
		// (no href) or transclusions for which TemplateData is not wanted.
		$tplData = null;
		if ( $part->type !== 'templatearg' && $part->href !== null && $useTplData ) {
			try {
				$title = Title::newFromText(
					PHPUtils::stripPrefix( Utils::decodeURIComponent( $part->href ), './' ),
					$this->env->getSiteConfig()
				);
				$tplData = $this->env->getDataAccess()->fetchTemplateData( $this->env->getPageConfig(), $title );
			} catch ( Exception $err ) {
				// Log the error, and use default serialization mode.
				// Better to misformat a transclusion than to lose an edit.
				$this->env->log( 'error/html2wt/tpldata', $err );
			}
			// If the template doesn't exist, or does but has no TemplateData, ignore it
			if ( !empty( $tplData['missing'] ) || !empty( $tplData['notemplatedata'] ) ) {
				$tplData = null;
			}
		}

		$buf = $this->serializePart(
			$state, $buf, $node, $part, $tplData,
			$srcParts[$i - 1] ?? null,
			$srcParts[$i + 1] ?? null
		);
	}
	return $buf;
}
| 949 | |
/**
 * Build the wikitext start tag of an extension node from its data-mw.
 *
 * The tag is left open ('>') when data-mw has a body and is emitted in
 * self-closing form (' />') otherwise.
 *
 * @param Element $node
 * @param SerializerState $state Not used by this method
 * @return string
 */
public function serializeExtensionStartTag( Element $node, SerializerState $state ): string {
	$dataMw = DOMDataUtils::getDataMw( $node );
	$tagName = $dataMw->name;

	// Serialize extension attributes in normalized form as:
	// key='value'
	// FIXME: with no dataParsoid, shadow info will mark it as new
	$kvs = [];
	foreach ( $dataMw->getExtAttribs() ?? [] as $attrName => $attrValue ) {
		// Numeric array keys come back as ints from PHP; KV wants a string key.
		$kvs[] = new KV( (string)$attrName, $attrValue );
	}
	$extTok = new TagTk( $tagName, $kvs );

	// Copy 'about' and 'typeof' from the node onto the token when present.
	foreach ( [ 'about', 'typeof' ] as $attrName ) {
		$attrValue = DOMCompat::getAttribute( $node, $attrName );
		if ( $attrValue !== null ) {
			$extTok->addAttribute( $attrName, $attrValue );
		}
	}

	$serializedAttrs = $this->serializeAttributes( $node, $extTok );
	$startTag = '<' . $tagName;
	if ( $serializedAttrs ) {
		$startTag .= ' ' . $serializedAttrs;
	}
	if ( isset( $dataMw->body ) ) {
		return $startTag . '>';
	}
	return $startTag . ' />';
}
| 980 | |
/**
 * Default serialization of an extension node: start tag, body source, end tag.
 *
 * When data-mw has a body but no string extsrc, falls back to data-parsoid's
 * src (returned as-is) if that is set; otherwise emits an empty body. Both
 * fallbacks are logged as errors.
 *
 * @param Element $node
 * @param SerializerState $state
 * @return string
 */
public function defaultExtensionHandler( Element $node, SerializerState $state ): string {
	$dp = DOMDataUtils::getDataParsoid( $node );
	$dataMw = DOMDataUtils::getDataMw( $node );
	$startTag = $this->serializeExtensionStartTag( $node, $state );

	if ( !isset( $dataMw->body ) ) {
		// We self-closed this already.
		return $startTag;
	}

	$extsrc = $dataMw->body->extsrc ?? null;
	if ( is_string( $extsrc ) ) {
		return $startTag . $extsrc . '</' . $dataMw->name . '>';
	}

	if ( isset( $dp->src ) ) {
		$this->env->log(
			'error/html2wt/ext',
			'Extension data-mw missing for: ' . DOMCompat::getOuterHTML( $node )
		);
		return $dp->src;
	}

	$this->env->log(
		'error/html2wt/ext',
		'Extension src unavailable for: ' . DOMCompat::getOuterHTML( $node )
	);
	return $startTag . '</' . $dataMw->name . '>';
}
| 1003 | |
/**
 * Emit text while routing its leading/trailing separator-like whitespace
 * through the separator machinery.
 *
 * @param string $res
 * @param Node $node
 */
private function serializeText( string $res, Node $node ): void {
	$state = $this->state;
	$suffixRE = self::$separatorREs['sepSuffixWithNlsRE'];

	// Remember, then cut off, trailing separator-like text
	// (at least 1 newline and other whitespace)
	preg_match( $suffixRE, $res, $newSepMatch );
	$res = preg_replace( $suffixRE, '', $res, 1 );

	// Outside indent-pre, leading newlines and other whitespace become separator text
	if ( !$state->inIndentPre
		&& preg_match( self::$separatorREs['sepPrefixWithNlsRE'], $res, $prefixMatch )
	) {
		$state->appendSep( $prefixMatch[0] );
		$res = substr( $res, strlen( $prefixMatch[0] ) );
	}

	$state->emitChunk(
		$state->needsEscaping ? Utils::escapeWtEntities( $res ) : $res,
		$node
	);

	// Move trailing newlines into the next separator, but only when no
	// separator source is pending.
	/* SSS FIXME: what are we doing with the stripped NLs?? */
	if ( $newSepMatch && !$state->sep->src ) {
		$state->appendSep( $newSepMatch[0] );
	}
}
| 1038 | |
/**
 * Serialize a text node's value with the state's needsEscaping flag
 * switched on for the duration of the call.
 *
 * @param Node $node
 * @return Node|null The node's next sibling
 */
private function serializeTextNode( Node $node ): ?Node {
	$state = $this->state;

	$state->needsEscaping = true;
	$this->serializeText( $node->nodeValue, $node );
	$state->needsEscaping = false;

	return $node->nextSibling;
}
| 1050 | |
/**
 * Emit non-separator wikitext that does not need to be escaped.
 *
 * Thin public wrapper around serializeText(); unlike serializeTextNode(),
 * it does not switch on the state's needsEscaping flag first.
 *
 * @param string $res Wikitext to emit
 * @param Node $node Node the wikitext is emitted for
 */
public function emitWikitext( string $res, Node $node ): void {
	$this->serializeText( $res, $node );
}
| 1059 | |
/**
 * DOM-based serialization of an element.
 *
 * In selser mode, an element whose original source can be reused and that
 * carries no diff markers is emitted straight from that source. In every
 * other case serialization is delegated to the given DOM handler.
 *
 * @param Element $node
 * @param DOMHandler $domHandler
 * @return Node|null
 */
private function serializeNodeInternal( Element $node, DOMHandler $domHandler ) {
	// To serialize a node from source, the node should satisfy these
	// conditions:
	//
	// 1. It should not have a diff marker or be in a modified subtree
	//    WTS should not be in a subtree with a modification flag that
	//    applies to every node of a subtree (rather than an indication
	//    that some node in the subtree is modified).
	//
	// 2. It should continue to be valid in any surrounding edited context
	//    For some nodes, modification of surrounding context
	//    can change serialized output of this node
	//    (ex: <td>s and whether you emit | or || for them)
	//
	// 3. It should have valid, usable DSR
	//
	// 4. Either it has non-zero positive DSR width, or meets one of the
	//    following:
	//
	//    4a. It is content like <p><br/><p> or an automatically-inserted
	//        wikitext <references/> (HTML <ol>) (will have dsr-width 0)
	//    4b. it is fostered content (will have dsr-width 0)
	//    4c. it is misnested content (will have dsr-width 0)
	//
	// SSS FIXME: Additionally, we can guard against buggy DSR with
	// some validity checks. We can test that non-sep src content
	// leading wikitext markup corresponds to the node type.
	//
	// Ex: If node.nodeName is 'UL', then src[0] should be '*'
	//
	// TO BE DONE

	$state = $this->state;
	$dp = DOMDataUtils::getDataParsoid( $node );

	// Conditions 1-3 (minus the diff-marker test, which is applied further down)
	$origSrcUsable = false;
	if ( $state->selserMode
		&& !$state->inInsertedContent
		&& WTSUtils::origSrcValidInEditedContext( $state, $node )
		&& Utils::isValidDSR( $dp->dsr ?? null )
	) {
		// Condition 4
		$dsr = $dp->dsr;
		if ( $dsr->end > $dsr->start ) {
			$origSrcUsable = true;
		} elseif ( $dsr->end === $dsr->start && (
			// FIXME: <p><br/></p>
			// nodes that have dsr width 0 because currently,
			// we emit newlines outside the p-nodes. So, this check
			// tries to handle that scenario.
			in_array( DOMCompat::nodeName( $node ), [ 'p', 'br' ], true )
			|| !empty( DOMDataUtils::getDataMw( $node )->autoGenerated )
			// FIXME: This is only necessary while outputContentVersion
			// 2.1.2 - 2.2.0 are still valid
			|| DOMUtils::hasTypeOf( $node, 'mw:Placeholder/StrippedTag' )
		) ) {
			$origSrcUsable = true;
		} else {
			$origSrcUsable = !empty( $dp->fostered ) || !empty( $dp->misnested );
		}
	}

	$wrapperUnmodified = false;
	if ( $origSrcUsable ) {
		if ( !DiffUtils::hasDiffMarkers( $node ) ) {
			// If this HTML node will disappear in wikitext because of
			// zero width, then the separator constraints will carry over
			// to the node's children.
			//
			// Since we don't recurse into 'node' in selser mode, we update the
			// separator constraintInfo to apply to 'node' and its first child.
			//
			// We could clear constraintInfo altogether which would be
			// correct (but could normalize separators and introduce dirty
			// diffs unnecessarily).

			$state->currNodeUnmodified = true;

			if ( WTUtils::isZeroWidthWikitextElt( $node )
				&& $node->hasChildNodes()
				&& ( $state->sep->constraints['constraintInfo']['sepType'] ?? null ) === 'sibling'
			) {
				$state->sep->constraints['constraintInfo']['onSOL'] = $state->onSOL;
				$state->sep->constraints['constraintInfo']['sepType'] = 'parent-child';
				$state->sep->constraints['constraintInfo']['nodeA'] = $node;
				$state->sep->constraints['constraintInfo']['nodeB'] = $node->firstChild;
			}

			$origSrc = $state->getOrigSrc( $dp->dsr ) ?? '';

			$this->env->trace(
				$this->logType,
				'ORIG-src with DSR', $dp->dsr, ' = ', $origSrc
			);

			// When reusing source, we should only suppress serializing
			// to a single line for the cases we've allowed in normal serialization.
			// <a> tags might look surprising here, but, here is the rationale.
			// If some link syntax (wikilink, extlink, etc.) accepted a newline
			// originally, we can safely let it through here. There is no need to have
			// specific checks for wikilinks / extlinks / ... etc. The only concern is
			// if the surrounding context in which this link-syntax is embedded also
			// breaks the link syntax. There is no such syntax right now.
			// FIXME: Note the limitation here, that if these nodes are nested
			// in something as trivial as an i / b, the suppression won't happen
			// and we'll dirty the text.
			$suppressSLC = WTUtils::isFirstEncapsulationWrapperNode( $node )
				|| DOMUtils::hasTypeOf( $node, 'mw:Nowiki' )
				|| in_array( DOMCompat::nodeName( $node ), [ 'dl', 'ul', 'ol', 'a' ], true )
				|| ( DOMCompat::nodeName( $node ) === 'table'
					&& DOMCompat::nodeName( $node->parentNode ) === 'dd'
					&& DiffDOMUtils::previousNonSepSibling( $node ) === null );

			// Use selser to serialize this text! The original wikitext is
			// `$origSrc`. `ConstrainedText::fromSelSer` picks the right type
			// of ConstrainedText chunk(s) to represent it, based on the node
			// type. Since the wikitext might have to be broken into multiple
			// chunks, `fromSelSer` returns an array.
			if ( $suppressSLC ) {
				$state->singleLineContext->disable();
			}
			foreach ( ConstrainedText::fromSelSer( $origSrc, $node, $dp, $this->env ) as $chunk ) {
				$state->emitChunk( $chunk, $chunk->node );
			}
			if ( $suppressSLC ) {
				$state->singleLineContext->pop();
			}

			// Skip over encapsulated content since it has already been
			// serialized.
			return WTUtils::isFirstEncapsulationWrapperNode( $node )
				? WTUtils::skipOverEncapsulatedContent( $node )
				: $node->nextSibling;
		}

		$wrapperUnmodified = DiffUtils::onlySubtreeChanged( $node ) &&
			WTSUtils::hasValidTagWidths( $dp->dsr ?? null );
	}

	$state->currNodeUnmodified = false;

	// Flag inserted content while the handler runs, then put the
	// previous flag value back.
	$insertedFlagBefore = $state->inInsertedContent;
	$enteringInsertedContent = $state->selserMode && DiffUtils::hasInsertedDiffMark( $node );
	if ( $enteringInsertedContent ) {
		$state->inInsertedContent = true;
	}

	$next = $domHandler->handle( $node, $state, $wrapperUnmodified );

	if ( $enteringInsertedContent ) {
		$state->inInsertedContent = $insertedFlagBefore;
	}

	return $next;
}
| 1221 | |
/**
 * Internal worker: serialize one DOM node, dispatching on its node type.
 *
 * Diff-marker elements, pure-separator text and comments are absorbed into
 * the separator state and return early. Other elements and text nodes get
 * separator constraints updated before and after their serialization
 * handler runs, followed by a modification-flag update.
 *
 * @private
 * @param Node $node
 * @return ?Node The node serialization should continue with
 */
public function serializeNode( Node $node ): ?Node {
	$nodeName = DOMCompat::nodeName( $node );
	$domHandlerFactory = new DOMHandlerFactory();
	$state = $this->state;
	$state->currNode = $node;

	// Selser mode additionally traces the previous node's modification state.
	$traceArgs = [ static fn () => WTSUtils::traceNodeName( $node ) ];
	if ( $state->selserMode ) {
		$traceArgs[] = '; prev-unmodified: ';
		$traceArgs[] = $state->prevNodeUnmodified;
	}
	$traceArgs[] = '; SOL: ';
	$traceArgs[] = $state->onSOL;
	$this->env->trace( $this->logType, ...$traceArgs );

	$domHandler = null;
	$method = null;
	switch ( $node->nodeType ) {
		case XML_COMMENT_NODE:
			// Merge this into separators
			$state->appendSep( WTSUtils::commentWT( $node->nodeValue ) );
			return $node->nextSibling;

		case XML_ELEMENT_NODE:
			'@phan-var Element $node';/** @var Element $node */
			// Ignore DiffMarker metas, but clear unmodified node state
			if ( DiffUtils::isDiffMarker( $node ) ) {
				$state->updateModificationFlags( $node );
				// `state.sep.lastSourceNode` is cleared here so that removed
				// separators between otherwise unmodified nodes don't get
				// restored.
				$state->updateSep( $node );
				return $node->nextSibling;
			}
			$domHandler = $domHandlerFactory->getDOMHandler( $node );
			$method = [ $this, 'serializeNodeInternal' ];
			break;

		case XML_TEXT_NODE:
			// This code assumes that the DOM is in normalized form with no
			// run of text nodes.
			// Accumulate whitespace from the text node into state.sep.src
			$text = $node->nodeValue;
			if ( !$state->inIndentPre
				// PORT-FIXME: original uses this->state->serializer->separatorREs
				// but that does not seem useful
				&& preg_match( self::$separatorREs['pureSepRE'], $text )
			) {
				$state->appendSep( $text );
				return $node->nextSibling;
			}
			if ( $state->selserMode ) {
				// Outside inserted content, the text counts as unmodified when
				// it is the first child of a top-level parent, or when its
				// previous sibling is not a diff marker.
				$prevSibling = $node->previousSibling;
				$state->currNodeUnmodified = !$state->inInsertedContent && (
					( !$prevSibling && DOMUtils::atTheTop( $node->parentNode ) ) ||
					( $prevSibling && !DiffUtils::isDiffMarker( $prevSibling ) )
				);
			}

			$domHandler = new DOMHandler( false );
			$method = [ $this, 'serializeTextNode' ];
			break;

		default:
			throw new InternalException( 'Unhandled node type: ' . $node->nodeType );
	}

	// Separator constraints between the previous non-separator sibling
	// (or the parent, if there is none) and this node
	$before = DiffDOMUtils::previousNonSepSibling( $node ) ?: $node->parentNode;
	$this->env->log( 'debug/wts', 'Before constraints for ' . $nodeName );
	$state->separators->updateSeparatorConstraints(
		$before, $domHandlerFactory->getDOMHandler( $before ),
		$node, $domHandler
	);

	$this->env->log( 'debug/wts', 'Calling serialization handler for ' . $nodeName );
	$nextNode = $method( $node, $domHandler );

	// Separator constraints between this node and the next non-separator
	// sibling (or the parent, if there is none)
	$after = DiffDOMUtils::nextNonSepSibling( $node ) ?: $node->parentNode;
	$this->env->log( 'debug/wts', 'After constraints for ' . $nodeName );
	$state->separators->updateSeparatorConstraints(
		$node, $domHandler,
		$after, $domHandlerFactory->getDOMHandler( $after )
	);

	// Update modification flags
	$state->updateModificationFlags( $node );

	return $nextNode;
}
| 1323 | |
/**
 * Re-escape a line whose heading nowiki turned out to be spurious.
 *
 * Lines are returned untouched unless the state recorded heading escapes
 * and the line matches HEADING_NOWIKI_REGEXP with something other than
 * comments/whitespace in its second capture group.
 *
 * @param string $line
 * @return string
 */
private function stripUnnecessaryHeadingNowikis( string $line ): string {
	$state = $this->state;
	if ( !$state->hasHeadingEscapes ) {
		return $line;
	}

	preg_match( self::HEADING_NOWIKI_REGEXP, $line, $parts );
	if ( !$parts || preg_match( Utils::COMMENT_OR_WS_REGEXP, $parts[2] ) ) {
		// All is good.
		return $line;
	}

	// The nowikiing was spurious since the trailing = is not in EOL position
	$reEscaped = $state->serializer->wteHandlers->escapedText( $state, false, $parts[1], false, true );
	return $reEscaped . $parts[2];
}
| 1344 | |
/**
 * Post-process the serialized output ($this->state->out), dropping or
 * shrinking whitespace-only <nowiki>s that sit at the start of a line
 * (after any sol-transparent wikitext).
 *
 * For each such line:
 * - if the rest of the line is only sol-transparent wikitext, or contains a
 *   wikitext block tag (and no non-HTML5 tag before it), the nowiki wrapper
 *   is removed and its whitespace kept;
 * - otherwise the leading whitespace is removed from the rest of the line
 *   and the nowiki is replaced by '<nowiki/>' when the line would then start
 *   with one of =*#:; and by nothing when it would not.
 */
private function stripUnnecessaryIndentPreNowikis(): void {
	$siteConfig = $this->env->getSiteConfig();

	// FIXME: The solTransparentWikitextRegexp includes redirects, which really
	// only belong at the SOF and should be unique. See the "New redirect" test.
	$noWikiRegexp = '@^'
		. PHPUtils::reStrip( $siteConfig->solTransparentWikitextNoWsRegexp(), '@' )
		. '((?i:<nowiki>\s+</nowiki>))([^\n]*(?:\n|$))' . '@Dm';
	$pieces = preg_split( $noWikiRegexp, $this->state->out, -1, PREG_SPLIT_DELIM_CAPTURE );
	$numPieces = count( $pieces );

	// Loop-invariant regexps, built lazily the first time an iteration needs them.
	$solTransparentRE = null;
	$wsReplacementRE = null;
	$solCharsTest = null;

	// Each match contributes four pieces: the sol-transparent prefix, the
	// nowiki, the rest of the line, and the text up to the next match.
	$out = $pieces[0];
	for ( $i = 1; $i < $numPieces; $i += 4 ) {
		$out .= $pieces[$i];
		$nowiki = $pieces[$i + 1];
		$rest = $pieces[$i + 2];
		// Ignore comments
		preg_match_all( '/<[^!][^<>]*>/', $rest, $htmlTags );

		// Not required if just sol transparent wt.
		$solTransparentRE ??= $siteConfig->solTransparentWikitextRegexp();
		$reqd = !preg_match( $solTransparentRE, $rest );

		if ( $reqd ) {
			foreach ( $htmlTags[0] as $rawTagName ) {
				// Strip </, attributes, and > to get the tagname
				$tagName = preg_replace( '/<\/?|\s.*|>/', '', $rawTagName );
				if ( !isset( Consts::$HTML['HTML5Tags'][$tagName] ) ) {
					// If we encounter any tag that is not a html5 tag,
					// it could be an extension tag. We could do a more complex
					// regexp or tokenize the string to determine if any block tags
					// show up outside the extension tag. But, for now, we just
					// conservatively bail and leave the nowiki as is.
					$reqd = true;
					break;
				} elseif ( TokenUtils::isWikitextBlockTag( $tagName ) ) {
					// FIXME: Extension tags shadowing html5 tags might not
					// have block semantics.
					// Block tags on a line suppress nowikis
					$reqd = false;
				}
			}
		}

		if ( !$reqd ) {
			$nowiki = preg_replace( '#^<nowiki>(\s+)</nowiki>#', '$1', $nowiki, 1 );
		} else {
			if ( $wsReplacementRE === null ) {
				$solTransparentWikitextNoWsRegexpFragment = PHPUtils::reStrip(
					$siteConfig->solTransparentWikitextNoWsRegexp(), '/' );
				$wsReplacementRE = '/^(' . $solTransparentWikitextNoWsRegexpFragment . ')\s+/';
				// Protect against sol-sensitive wikitext characters
				$solCharsTest = '/^' . $solTransparentWikitextNoWsRegexpFragment . '[=*#:;]/';
			}

			// Replace all leading whitespace
			do {
				$oldRest = $rest;
				$rest = preg_replace( $wsReplacementRE, '$1', $rest );
			} while ( $rest !== $oldRest );

			$nowiki = preg_replace( '#^<nowiki>(\s+)</nowiki>#',
				preg_match( $solCharsTest, $rest ) ? '<nowiki/>' : '', $nowiki, 1 );
		}
		$out = $out . $nowiki . $rest . $pieces[$i + 3];
	}
	$this->state->out = $out;
}
| 1405 | |
/**
 * Heuristically remove one redundant <nowiki/> that protects a single
 * quote character sitting next to bold / italic quote markup.
 *
 * When <i> and <b> tags are matched up properly,
 * - any single ' char before <i> or <b> does not need <nowiki/> protection.
 * - any single ' char before </i> or </b> does not need <nowiki/> protection.
 *
 * The line is returned unchanged when the serializer state recorded no
 * quote nowikis, when the line lacks either a <nowiki/> or a quote char,
 * when the line cannot be tokenized, or when the [[ ]], {{ }}, quote and
 * HTML tag tokens on the line are not properly paired.
 *
 * @param string $line A single line of serialized wikitext
 * @return string The line, with at most one <nowiki/> removed
 */
private function stripUnnecessaryQuoteNowikis( string $line ): string {
	if ( !$this->state->hasQuoteNowikis ) {
		return $line;
	}

	// Optimization: We are interested in <nowiki/>s before quote chars.
	// So, skip this if we don't have both.
	if ( !preg_match( '#<nowiki\s*/>#', $line ) || !str_contains( $line, "'" ) ) {
		return $line;
	}

	// * Split out all the [[ ]] {{ }} '' ''' ''''' <..> </...>
	//   parens in the regexp mean that the split segments will
	//   be spliced into the result array as the odd elements.
	// * If we match up the tags properly and we see opening
	//   <i> / <b> / <i><b> tags preceded by a '<nowiki/>, we
	//   can remove all those nowikis.
	//   Ex: '<nowiki/>''foo'' bar '<nowiki/>'''baz'''
	// * If we match up the tags properly and we see closing
	//   <i> / <b> / <i><b> tags preceded by a '<nowiki/>, we
	//   can remove all those nowikis.
	//   Ex: ''foo'<nowiki/>'' bar '''baz'<nowiki/>'''
	// phpcs:ignore Generic.Files.LineLength.TooLong
	$p = preg_split( "#('''''|'''|''|\[\[|\]\]|\{\{|\}\}|<\w+(?:\s+[^>]*?|\s*?)/?>|</\w+\s*>)#", $line, -1, PREG_SPLIT_DELIM_CAPTURE );
	if ( $p === false ) {
		// PCRE gave up (e.g. backtrack limit on a pathological line).
		// This pass is only a cleanup heuristic, so leave the line alone
		// instead of feeding `false` to count() below.
		return $line;
	}

	// Which nowiki do we strip out?
	$nowikiIndex = -1;

	// Verify that everything else is properly paired up.
	$stack = [];
	$quotesOnStack = 0;
	$n = count( $p );
	$nonHtmlTag = null;
	// Loop-invariant: fetch the extension tag map once, not once per token.
	$extTagMap = $this->env->getSiteConfig()->getExtensionTagNameMap();
	for ( $j = 1; $j < $n; $j += 2 ) {
		// For HTML tags, pull out just the tag name for clearer code below.
		// Non-tag delimiters ('' [[ ]] {{ }} ...) fall through as-is.
		preg_match( '#^<(/?\w+)#', $p[$j], $matches );
		$tag = mb_strtolower( $matches[1] ?? $p[$j] );
		// Measured before the optional '/' suffix is appended below.
		$tagLen = strlen( $tag );
		$selfClose = false;
		if ( str_ends_with( $p[$j], '/>' ) ) {
			$tag .= '/';
			$selfClose = true;
		}

		// Ignore non-html-tag (<nowiki> OR extension tag) blocks
		if ( !$nonHtmlTag ) {
			if ( isset( $extTagMap[$tag] ) ) {
				$nonHtmlTag = $tag;
				continue;
			}
		} else {
			// Inside such a block: skip every token until its closing tag.
			if ( $tagLen > 0 && $tag[0] === '/' && substr( $tag, 1 ) === $nonHtmlTag ) {
				$nonHtmlTag = null;
			}
			continue;
		}

		if ( $tag === ']]' ) {
			if ( array_pop( $stack ) !== '[[' ) {
				return $line;
			}
		} elseif ( $tag === '}}' ) {
			if ( array_pop( $stack ) !== '{{' ) {
				return $line;
			}
		} elseif ( $tagLen > 0 && $tag[0] === '/' ) { // closing html tag
			// match html/ext tags
			$openTag = array_pop( $stack );
			if ( $tag !== ( '/' . $openTag ) ) {
				return $line;
			}
		} elseif ( $tag === 'nowiki/' ) {
			// We only want to process:
			// - trailing single quotes (bar')
			// - or single quotes by themselves without a preceding '' sequence
			if ( substr( $p[$j - 1], -1 ) === "'"
				&& !( $p[$j - 1] === "'" && $j > 1 && substr( $p[$j - 2], -2 ) === "''" )
				// Consider <b>foo<i>bar'</i>baz</b> or <b>foo'<i>bar'</i>baz</b>.
				// The <nowiki/> before the <i> or </i> cannot be stripped
				// if the <i> is embedded inside another quote.
				&& ( $quotesOnStack === 0
					// The only strippable scenario with a single quote elt on stack
					// is: ''bar'<nowiki/>''
					//   -> ["", "''", "bar'", "<nowiki/>", "", "''"]
					|| ( $quotesOnStack === 1
						&& $j + 2 < $n
						&& $p[$j + 1] === ''
						&& $p[$j + 2][0] === "'"
						&& $p[$j + 2] === PHPUtils::lastItem( $stack ) ) )
			) {
				// Later candidates overwrite earlier ones, so only the
				// last qualifying <nowiki/> on the line is remembered.
				$nowikiIndex = $j;
			}
			continue;
		} elseif ( $selfClose || $tag === 'br' ) {
			// Skip over self-closing tags or what should have been self-closed.
			// ( While we could do this for all void tags defined in
			// mediawiki.wikitext.constants.js, <br> is the most common
			// culprit. )
			continue;
		} elseif ( $tagLen > 0 && $tag[0] === "'" && PHPUtils::lastItem( $stack ) === $tag ) {
			// A quote token identical to the one on top of the stack closes it.
			array_pop( $stack );
			$quotesOnStack--;
		} else {
			// Anything else opens a new construct.
			$stack[] = $tag;
			if ( $tagLen > 0 && $tag[0] === "'" ) {
				$quotesOnStack++;
			}
		}
	}

	// Leftover open constructs mean the line is not properly paired.
	if ( count( $stack ) ) {
		return $line;
	}

	if ( $nowikiIndex !== -1 ) {
		// We can only remove the final trailing nowiki.
		//
		// HTML : <i>'foo'</i>
		// line : ''<nowiki/>'foo'<nowiki/>''
		$p[$nowikiIndex] = '';
		return implode( '', $p );
	} else {
		return $line;
	}
}
| 1539 | |
/**
 * Serialize an HTML DOM.
 *
 * WARNING: You probably want to use WikitextContentModelHandler::fromDOM instead.
 *
 * Steps, in order: normalize the DOM, run the serializer state over it,
 * strip nowikis that turned out to be unnecessary, and finally prepend any
 * buffered redirect text to the output.
 *
 * @param Document|DocumentFragment $node Root to serialize. The native type
 *   is Node, but anything other than these two classes is rejected by the
 *   parameter-type assertion. A Document is replaced by its body.
 * @param bool $selserMode Forwarded to the serializer state's initMode();
 *   also selects the 'selser' (true) or 'wts' (false) log type.
 * @return string The final contents of the serializer state's output buffer
 */
public function serializeDOM(
	Node $node, bool $selserMode = false
): string {
	Assert::parameterType(
		[ Document::class, DocumentFragment::class ],
		$node, '$node' );

	if ( $node instanceof Document ) {
		$node = DOMCompat::getBody( $node );
	}

	$this->logType = $selserMode ? 'selser' : 'wts';

	$state = $this->state;
	$state->initMode( $selserMode );

	$domNormalizer = new DOMNormalizer( $state );
	$domNormalizer->normalize( $node );

	if ( $this->env->hasDumpFlag( 'dom:post-normal' ) ) {
		$options = [ 'storeDiffMark' => true ];
		$this->env->writeDump( ContentUtils::dumpDOM( $node, 'DOM: post-normal', $options ) );
	}

	$state->kickOffSerialize( $node );

	if ( $state->hasIndentPreNowikis ) {
		// FIXME: Perhaps this can be done on a per-line basis
		// rather than do one post-pass on the entire document.
		$this->stripUnnecessaryIndentPreNowikis();
	}

	$splitLines = $state->selserMode
		|| $state->hasQuoteNowikis
		|| $state->hasSelfClosingNowikis
		|| $state->hasHeadingEscapes;

	if ( $splitLines ) {
		// Post-pass over the finished output, one line at a time: first the
		// quote-nowiki cleanup, then the heading-nowiki cleanup.
		$state->out = implode( "\n", array_map( function ( $line ) {
			$line = $this->stripUnnecessaryQuoteNowikis( $line );

			return $this->stripUnnecessaryHeadingNowikis( $line );
		}, explode( "\n", $state->out ) ) );
	}

	if ( $state->redirectText && $state->redirectText !== 'unbuffered' ) {
		// Separate the redirect from the content with a newline unless the
		// output is empty or already begins with whitespace. The pattern is
		// anchored at the start, so it is applied to the output directly:
		// the previous explode( "\n", ..., 1 ) never split anything (a limit
		// of 1 yields the whole string) and merely copied the buffer.
		$nl = preg_match( '/^(\s|$)/D', $state->out ) ? '' : "\n";
		$state->out = $state->redirectText . $nl . $state->out;
	}

	return $state->out;
}
| 1604 | } |