Code Coverage for /src/src/Html2Wt/LinkHandlerUtils.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	0.00% covered (danger)	0.00%	0 / 814	0.00% covered (danger)	0.00%	0 / 16	CRAP	0.00% covered (danger)	0.00%	0 / 1
LinkHandlerUtils	0.00% covered (danger)	0.00%	0 / 814	0.00% covered (danger)	0.00%	0 / 16	86142	0.00% covered (danger)	0.00%	0 / 1
splitLinkContentString	0.00% covered (danger)	0.00%	0 / 15	0.00% covered (danger)	0.00%	0 / 1	30
getHref	0.00% covered (danger)	0.00%	0 / 10	0.00% covered (danger)	0.00%	0 / 1	56
normalizeIWP	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
escapeLinkTarget	0.00% covered (danger)	0.00%	0 / 6	0.00% covered (danger)	0.00%	0 / 1	6
getContentString	0.00% covered (danger)	0.00%	0 / 13	0.00% covered (danger)	0.00%	0 / 1	42
getLinkRoundTripData	0.00% covered (danger)	0.00%	0 / 122	0.00% covered (danger)	0.00%	0 / 1	2756
escapeExtLinkURL	0.00% covered (danger)	0.00%	0 / 12	0.00% covered (danger)	0.00%	0 / 1	2
addColonEscape	0.00% covered (danger)	0.00%	0 / 8	0.00% covered (danger)	0.00%	0 / 1	30
isURLLink	0.00% covered (danger)	0.00%	0 / 7	0.00% covered (danger)	0.00%	0 / 1	56
hasAutoUrlTerminatingChars	0.00% covered (danger)	0.00%	0 / 2	0.00% covered (danger)	0.00%	0 / 1	2
isSimpleWikiLink	0.00% covered (danger)	0.00%	0 / 49	0.00% covered (danger)	0.00%	0 / 1	306
serializeAsWikiLink	0.00% covered (danger)	0.00%	0 / 125	0.00% covered (danger)	0.00%	0 / 1	2352
serializeAsExtLink	0.00% covered (danger)	0.00%	0 / 38	0.00% covered (danger)	0.00%	0 / 1	132
linkHandler	0.00% covered (danger)	0.00%	0 / 55	0.00% covered (danger)	0.00%	0 / 1	240
figureHandler	0.00% covered (danger)	0.00%	0 / 9	0.00% covered (danger)	0.00%	0 / 1	6
figureToConstrainedText	0.00% covered (danger)	0.00%	0 / 342	0.00% covered (danger)	0.00%	0 / 1	12882

1	<?php
2	declare( strict_types = 1 );
3
4	namespace Wikimedia\Parsoid\Html2Wt;
5
6	use stdClass;
7	use UnexpectedValueException;
8	use Wikimedia\Parsoid\Config\Env;
9	use Wikimedia\Parsoid\Core\MediaStructure;
10	use Wikimedia\Parsoid\DOM\Element;
11	use Wikimedia\Parsoid\DOM\Node;
12	use Wikimedia\Parsoid\DOM\Text;
13	use Wikimedia\Parsoid\Html2Wt\ConstrainedText\AutoURLLinkText;
14	use Wikimedia\Parsoid\Html2Wt\ConstrainedText\ConstrainedText;
15	use Wikimedia\Parsoid\Html2Wt\ConstrainedText\ExtLinkText;
16	use Wikimedia\Parsoid\Html2Wt\ConstrainedText\MagicLinkText;
17	use Wikimedia\Parsoid\Html2Wt\ConstrainedText\WikiLinkText;
18	use Wikimedia\Parsoid\NodeData\DataParsoid;
19	use Wikimedia\Parsoid\NodeData\TempData;
20	use Wikimedia\Parsoid\Utils\ContentUtils;
21	use Wikimedia\Parsoid\Utils\DOMCompat;
22	use Wikimedia\Parsoid\Utils\DOMDataUtils;
23	use Wikimedia\Parsoid\Utils\DOMUtils;
24	use Wikimedia\Parsoid\Utils\PHPUtils;
25	use Wikimedia\Parsoid\Utils\TokenUtils;
26	use Wikimedia\Parsoid\Utils\UrlUtils;
27	use Wikimedia\Parsoid\Utils\Utils;
28	use Wikimedia\Parsoid\Utils\WTUtils;
29	use Wikimedia\Parsoid\Wt2Html\TokenizerUtils;
30
31	/**
32	* Serializes link markup.
33	*/
34	class LinkHandlerUtils {
/**
 * Matches a string made up solely of space, tab, LF, CR, NUL and
 * vertical-tab characters (the empty string matches too).
 * serializeAsWikiLink() runs it over the output produced so far to decide
 * whether a redirect is still in start-of-file position.
 */
private static string $REDIRECT_TEST_RE = '/^([ \t\n\r\0\x0b])*$/D';

/**
 * Matches a run of one or more spaces, underscores, or the Unicode space
 * characters listed in the class. isSimpleWikiLink() replaces each such run
 * in a decoded target with a single '_' before comparing it with a
 * normalized title key.
 */
private static string $MW_TITLE_WHITESPACE_RE
	= '/[ _\xA0\x{1680}\x{180E}\x{2000}-\x{200A}\x{2028}\x{2029}\x{202F}\x{205F}\x{3000}]+/u';
38
/**
 * Peel a known prefix and tail off a link's content string.
 *
 * The tail is handled first; the prefix is then looked for in whatever is
 * left. A prefix/tail that is empty, or that does not actually bracket the
 * content, is reported back as '' and the content is left alone on that side.
 *
 * @param string $contentString
 * @param DataParsoid $dp Supplies the candidate ->prefix and ->tail
 * @return stdClass Object with string fields 'contentString', 'tail', 'prefix'
 */
private static function splitLinkContentString( string $contentString, DataParsoid $dp ): stdClass {
	$trailing = $dp->tail ?? '';
	if ( $trailing !== '' && str_ends_with( $contentString, $trailing ) ) {
		// The content really ends with the tail: cut it off.
		$contentString = substr( $contentString, 0, -strlen( $trailing ) );
	} else {
		$trailing = '';
	}

	$leading = $dp->prefix ?? '';
	if ( $leading !== '' && str_starts_with( $contentString, $leading ) ) {
		// The (tail-less) content really starts with the prefix: cut it off.
		$contentString = substr( $contentString, strlen( $leading ) );
	} else {
		$leading = '';
	}

	$parts = new stdClass;
	$parts->contentString = $contentString;
	$parts->tail = $trailing;
	$parts->prefix = $leading;
	return $parts;
}
71
/**
 * Read the node's href, resolving protocol-less absolute paths if possible.
 *
 * When the href starts with a single '/' (absolute path, no protocol and
 * not protocol-relative), each interwiki entry flagged 'localinterwiki'
 * that has a 'url' is tried as a base: the href is expanded against it and
 * the first expansion that fits the entry's URL pattern ('$1' acting as a
 * wildcard) is returned. Otherwise the href is returned as found, or '' if
 * the attribute is absent.
 *
 * @param Env $env
 * @param Element $node
 * @return string
 */
private static function getHref( Env $env, Element $node ): string {
	$href = DOMCompat::getAttribute( $node, 'href' ) ?? '';

	$isAbsolutePath = ( $href[0] ?? '' ) === '/' && ( $href[1] ?? '' ) !== '/';
	if ( !$isAbsolutePath ) {
		return $href;
	}

	// Protocol-less but absolute: look for a base href that fits.
	foreach ( $env->getSiteConfig()->interwikiMapNoNamespaces() as $iwInfo ) {
		if ( !isset( $iwInfo['localinterwiki'] ) || !isset( $iwInfo['url'] ) ) {
			continue;
		}
		$base = $iwInfo['url'];

		// Evaluate the href relative to this base ...
		$expanded = UrlUtils::expandUrl( $href, $base );

		// ... and accept it if it fits the base's own URL pattern.
		$pattern = '/^' . strtr( preg_quote( $base, '/' ), [ '\\$1' => '.*' ] ) . '$/sD';
		if ( preg_match( $pattern, $expanded ) ) {
			return $expanded;
		}
	}

	return $href;
}
102
/**
 * Normalize an interwiki prefix for lookup: lower-case it, trim
 * surrounding whitespace, then hand it to PHPUtils::stripPrefix() to drop
 * a leading ':'.
 *
 * @param string $str
 * @return string
 */
private static function normalizeIWP( string $str ): string {
	$lowered = strtolower( $str );
	$trimmed = trim( $lowered );
	return PHPUtils::stripPrefix( $trimmed, ':' );
}
111
/**
 * Entity-escape a link target and report whether it is usable as one.
 *
 * @param string $linkTarget
 * @param SerializerState $state
 * @return stdClass Object with:
 *   - 'linkTarget' (string): the entity-escaped target
 *   - 'invalidLink' (bool): true when the escaped target fails
 *     Env::isValidLinkTarget() or contains a literal '|'
 */
private static function escapeLinkTarget( string $linkTarget, SerializerState $state ): stdClass {
	// Entity-escape the content.
	$linkTarget = Utils::escapeWtEntities( $linkTarget );
	return (object)[
		'linkTarget' => $linkTarget,
		// Is this an invalid link?
		'invalidLink' => !$state->getEnv()->isValidLinkTarget( $linkTarget ) ||
			// `isValidLinkTarget` omits fragments (the part after #) so,
			// even though "|" is an invalid character, we still need to ensure
			// it doesn't appear in there. The percent encoded version is fine
			// in the fragment, since it won't break the parse.
			str_contains( $linkTarget, '|' ),
	];
}
132
/**
 * Flatten a node's children to plain text, when that is possible.
 *
 * Text nodes contribute their value, mw:DisplaySpace elements contribute a
 * single space, and diff markers contribute nothing. Any other child makes
 * the content non-representable and yields null.
 *
 * NOTE: null vs. empty string is a little inconsistent here: a node with no
 * children gives null, while a node holding only a diff marker gives ''.
 * One of the current callers seems to subtly depend on that.
 *
 * FIXME(T254501): This function can return `$node->textContent` instead
 * of the string concatenation once mw:DisplaySpace is preprocessed away.
 *
 * @param Node $node
 * @return ?string
 */
private static function getContentString( Node $node ): ?string {
	if ( !$node->hasChildNodes() ) {
		return null;
	}

	$text = '';
	for ( $kid = $node->firstChild; $kid; $kid = $kid->nextSibling ) {
		if ( $kid instanceof Text ) {
			$text .= $kid->nodeValue;
			continue;
		}
		if ( DOMUtils::hasTypeOf( $kid, 'mw:DisplaySpace' ) ) {
			$text .= ' ';
			continue;
		}
		if ( !DiffUtils::isDiffMarker( $kid ) ) {
			// Anything other than text, display space or a diff marker
			// cannot be expressed as a plain string.
			return null;
		}
	}

	return $text;
}
166
/**
 * Gather the round-trip data needed to serialize a link node.
 *
 * The returned object always carries 'type', 'href', 'origHref', 'target',
 * 'tail', 'prefix', 'linkType' and 'content' (a stdClass that may hold
 * 'string'). Depending on what is found it may additionally carry
 * 'contentModified', 'contentNode', 'isRedirect', 'isInterwiki',
 * 'isInterwikiLang' and 'isLocal'.
 *
 * Processing order:
 *  1. derive the link type from the rel attribute (defaulting to mw:ExtLink
 *     when no media element is selected under the node);
 *  2. record the href, both as found and with leading ./ or ../ stripped;
 *  3. pick up the shadowed href as the target and the plain-text or node
 *     content;
 *  4. mw:MediaLink returns early with a target built from the resource;
 *  5. otherwise, if the href matches an interwiki URL pattern and the
 *     conversion conditions hold, rewrite the target as an interwiki (or
 *     local) wikilink target.
 *
 * @param Env $env
 * @param Element $node
 * @param SerializerState $state
 * @return stdClass
 */
private static function getLinkRoundTripData(
	Env $env, Element $node, SerializerState $state
): stdClass {
	$dp = DOMDataUtils::getDataParsoid( $node );
	$siteConfig = $env->getSiteConfig();
	$rtData = (object)[
		'type' => null, // could be null
		'href' => null, // filled in below
		'origHref' => null, // filled in below
		'target' => null, // filled in below
		'tail' => $dp->tail ?? '',
		'prefix' => $dp->prefix ?? '',
		'linkType' => null
	];
	$rtData->content = new stdClass;

	// Figure out the type of the link
	if ( $node->hasAttribute( 'rel' ) ) {
		$rel = DOMCompat::getAttribute( $node, 'rel' ) ?? '';
		// Parsoid only emits and recognizes ExtLink, WikiLink, and PageProp rel values.
		// Everything else defaults to ExtLink during serialization (unless it is
		// serializable to a wikilink)
		// We're keeping the preg_match here instead of going through DOMUtils::matchRel
		// because we have \b guards to handle the multivalue, and we're keeping the matches,
		// which matchRel doesn't do.
		if ( preg_match( '/\b(mw:(WikiLink|ExtLink|MediaLink|PageProp)\S*)\b/', $rel, $typeMatch ) ) {
			$rtData->type = $typeMatch[1];
			// Strip link subtype info
			if ( $typeMatch[2] === 'WikiLink' || $typeMatch[2] === 'ExtLink' ) {
				$rtData->type = 'mw:' . $typeMatch[2];
			}
		}
	}

	// Default link type if nothing else is set
	if ( $rtData->type === null && !DOMUtils::selectMediaElt( $node ) ) {
		$rtData->type = 'mw:ExtLink';
	}

	// Get href, and save the token's "real" href for comparison
	$href = self::getHref( $env, $node );
	$rtData->origHref = $href;
	$rtData->href = preg_replace( '#^(\.\.?/)+#', '', $href, 1 );

	// WikiLinks should be relative (but see below); fixup the link type
	// if a WikiLink has an absolute URL.
	// (This may get converted back to a WikiLink below, in the interwiki
	// handling code.)
	if ( $rtData->type === 'mw:WikiLink' &&
		( preg_match( '#^(\w+:)?//#', $rtData->href ) ||
			substr( $rtData->origHref ?? '', 0, 1 ) === '/' )
	) {
		$rtData->type = 'mw:ExtLink';
	}

	// Now get the target from rt data
	$rtData->target = $state->serializer->serializedAttrVal( $node, 'href' );

	// Check if the link content has been modified or is newly inserted content.
	// FIXME: This will only work with selser of course. Hard to test without selser.
	if (
		$state->inInsertedContent ||
		DiffUtils::hasDiffMark( $node, DiffMarkers::SUBTREE_CHANGED )
	) {
		$rtData->contentModified = true;
	}

	// Get the content string or tokens
	$contentString = self::getContentString( $node );
	if ( $contentString !== null ) {
		if ( !empty( $rtData->target['value'] ) && $rtData->target['value'] !== $contentString ) {
			// Try to identify a new potential tail
			$contentParts = self::splitLinkContentString( $contentString, $dp );
			$rtData->content->string = $contentParts->contentString;
			$rtData->tail = $contentParts->tail;
			$rtData->prefix = $contentParts->prefix;
		} else {
			$rtData->tail = '';
			$rtData->prefix = '';
			$rtData->content->string = $contentString;
		}
	} elseif ( $node->hasChildNodes() ) {
		$rtData->contentNode = $node;
	} elseif ( $rtData->type === 'mw:PageProp/redirect' ) {
		$rtData->isRedirect = true;
		$rtData->prefix = $dp->src
			?? ( ( $siteConfig->mwAliases()['redirect'][0] ?? '#REDIRECT' ) . ' ' );
	}

	// Update link type based on additional analysis.
	// What might look like external links might be serializable as a wikilink.
	// NB: $target aliases $rtData->target, so writes below update the result.
	$target = &$rtData->target;

	// mw:MediaLink annotations are considered authoritative
	// and interwiki link matches aren't made for these
	if ( $rtData->type === 'mw:MediaLink' ) {
		// Parse title from resource attribute (see analog in image handling)
		$resource = $state->serializer->serializedAttrVal( $node, 'resource' );
		if ( $resource['value'] === null ) {
			// from non-parsoid HTML: try to reconstruct resource from href?
			// (See similar code which tries to guess resource from <img src>)
			$mediaPrefix = $siteConfig->namespaceName( $siteConfig->namespaceId( 'media' ) );
			$slashPos = strrpos( $rtData->origHref, '/' );
			$fileName = $slashPos === false ? $rtData->origHref :
				substr( $rtData->origHref, $slashPos + 1 );
			$resource = [
				'value' => $mediaPrefix . ':' . $fileName,
				'fromsrc' => false,
				'modified' => false
			];
		}
		$rtData->target = $resource;
		$rtData->href = preg_replace( '#^(\.\.?/)+#', '', $rtData->target['value'], 1 );
		return $rtData;
	}

	// Check if the href matches any of our interwiki URL patterns
	$interwikiMatch = $siteConfig->interwikiMatcher( $href );
	if ( !$interwikiMatch ) {
		return $rtData;
	}

	$iw = $siteConfig->interwikiMapNoNamespaces()[ltrim( $interwikiMatch[0], ':' )];
	$localInterwiki = !empty( $iw['local'] );

	// Only to be used in question mark check, since other checks want to include the fragment
	$targetForQmarkCheck = $interwikiMatch[1];
	// FIXME: If ever the default value for $wgExternalInterwikiFragmentMode
	// changes, we can reduce this by always stripping off the fragment
	// identifier, since in "html5" mode, that isn't encoded. At present,
	// we can only do that if we know it's a local interwiki link.
	if ( $localInterwiki ) {
		$withoutFragment = strstr( $targetForQmarkCheck, '#', true );
		if ( $withoutFragment !== false ) {
			$targetForQmarkCheck = $withoutFragment;
		}
	}

	if (
		// Question mark is a valid title char, so it won't fail the test below,
		// but gets percent encoded on the way out since it has special
		// semantics in a url. That will break the url we're serializing, so
		// protect it.
		strpos( $targetForQmarkCheck, '?' ) === false &&
		// Ensure we have a valid link target, otherwise falling back to extlink
		// is preferable, since it won't serialize as a link.
		(
			$interwikiMatch[1] === '' || !self::escapeLinkTarget(
				// Append the prefix since we want to validate the target
				// with respect to it being an interwiki.
				$interwikiMatch[0] . ':' . $interwikiMatch[1],
				$state
			)->invalidLink
		) &&
		// ExtLinks should have content to convert.
		(
			$rtData->type !== 'mw:ExtLink' ||
			!empty( $rtData->content->string ) ||
			!empty( $rtData->contentNode )
		) &&
		( !empty( $dp->isIW ) || !empty( $target['modified'] ) || !empty( $rtData->contentModified ) )
	) {
		// External link that is really an interwiki link. Convert it.
		// TODO: Leaving this for backwards compatibility, remove when 1.5 is no longer bound
		if ( $rtData->type === 'mw:ExtLink' ) {
			$rtData->type = 'mw:WikiLink';
		}
		$rtData->isInterwiki = true;
		$iwMap = $siteConfig->interwikiMapNoNamespaces();
		// could this be confused with a language link?
		$iwi = $iwMap[self::normalizeIWP( $interwikiMatch[0] )] ?? null;
		$rtData->isInterwikiLang = $iwi && isset( $iwi['language'] );
		// is this our own wiki?
		$rtData->isLocal = $iwi && isset( $iwi['localinterwiki'] );
		// strip off localinterwiki prefixes
		$localPrefix = '';
		$oldPrefix = null;
		while ( true ) {
			$tmp = substr( $target['value'], strlen( $localPrefix ) );
			if ( !preg_match( '/^(:?([^:]+)):/', $tmp, $oldPrefix ) ) {
				break;
			}
			$iwi = $iwMap[Utils::normalizeNamespaceName( $oldPrefix[2] )] ?? null;
			if ( !$iwi || !isset( $iwi['localinterwiki'] ) ) {
				break;
			}
			$localPrefix .= $oldPrefix[1] . ':';
		}

		if ( !empty( $target['fromsrc'] ) && empty( $target['modified'] ) ) {
			// Leave the target alone!
		} else {
			if ( $rtData->type === 'mw:PageProp/Language' ) {
				$targetValue = implode( ':', $interwikiMatch );
				// Strip initial colon
				if ( $targetValue[0] === ':' ) {
					$targetValue = substr( $targetValue, 1 );
				}
				$target['value'] = $targetValue;
			} elseif (
				$oldPrefix && ( // Should we preserve the old prefix?
					strcasecmp( $oldPrefix[1], $interwikiMatch[0] ) === 0 ||
					// Check if the old prefix mapped to the same URL as
					// the new one. Use the old one if that's the case.
					// Example: [[w:Foo]] vs. [[:en:Foo]]
					( $iwMap[self::normalizeIWP( $oldPrefix[1] )]['url'] ?? null )
						=== ( $iwMap[self::normalizeIWP( $interwikiMatch[0] )]['url'] ?? null )
				)
			) {
				// Reuse old prefix capitalization
				if ( Utils::decodeWtEntities( substr( $target['value'], strlen( $oldPrefix[1] ) + 1 ) )
					!== $interwikiMatch[1]
				) {
					// Modified, update target.value.
					$target['value'] = $localPrefix . $oldPrefix[1] . ':' . $interwikiMatch[1];
				}
				// Ensure that we generate an interwiki link and not a language link!
				if ( $rtData->isInterwikiLang && $target['value'][0] !== ':' ) {
					$target['value'] = ':' . $target['value'];
				}
			} else { // Else: preserve old encoding
				if ( !empty( $rtData->isLocal ) ) {
					// - interwikiMatch[0] will be something like ":en" or "w"
					// - This tests whether the interwiki-like link is actually
					// a local wikilink.

					$target['value'] = $interwikiMatch[1];
					// interwikiMatch[1] may start with a language link prefix,
					// ensure that we generate interwiki link syntax in that case. (T292022)
					if (
						preg_match( '/^([^:]+):/', $target['value'], $match ) &&
						!empty( $iwMap[self::normalizeIWP( $match[1] )]['language'] )
					) {
						$target['value'] = ':' . $target['value'];
					}

					$rtData->isInterwiki = $rtData->isInterwikiLang = false;
				} else {
					$target['value'] = implode( ':', $interwikiMatch );
				}
			}
		}
	}

	return $rtData;
}
420
/**
 * Make an already percent-encoded URL safe to emit as wikitext.
 *
 * Two passes are applied:
 *  1. every character matched by the inner pattern (brackets, angle
 *     brackets, double quote, control characters, the listed Unicode
 *     spaces, and a '-' that is directly followed by '{') is replaced by
 *     the result of Utils::entityEncodeAll();
 *  2. if the host part is an IPv6 literal, the brackets around it, which
 *     pass 1 has just entity-encoded, are turned back into literal '[' and
 *     ']'.
 *
 * Existing percent escapes in the URL are not touched.
 *
 * NOTE(review): pass 2 assumes Utils::entityEncodeAll() renders '[' and ']'
 * as the hexadecimal entities &#x5B; / &#x5D; (matched case-insensitively)
 * — confirm against that helper.
 *
 * @param string $urlStr
 * @return string
 */
private static function escapeExtLinkURL( string $urlStr ): string {
	// this regexp is the negation of EXT_LINK_URL_CLASS in the PHP parser
	return preg_replace(
		// IPv6 host names are bracketed with []. Entity-decode these.
		'!^([a-z][^:/]*:)?//&#x5B;([0-9a-f:.]+)&#x5D;(:\d|/|$)!iD',
		'$1//[$2]$3',
		preg_replace_callback(
			// phpcs:ignore Generic.Files.LineLength.TooLong
			'/[\]\[<>"\x00-\x20\x7F\x{A0}\x{1680}\x{180E}\x{2000}-\x{200A}\x{202F}\x{205F}\x{3000}]|-(?=\{)/u',
			static function ( $m ) {
				return Utils::entityEncodeAll( $m[0] );
			},
			$urlStr
		),
		1
	);
}
446
/**
 * Add a colon escape to a wikilink target string if needed.
 *
 * A leading ':' is prepended when the target resolves to a title in the
 * category or file namespace, the link type is mw:WikiLink, and the target
 * does not already start with ':'. In every other case the target is
 * returned unchanged.
 *
 * @param Env $env
 * @param string $linkTarget
 * @param stdClass $linkData Link round-trip data; only ->type is read
 * @return string
 */
private static function addColonEscape(
	Env $env, string $linkTarget, stdClass $linkData
): string {
	$linkTitle = $env->makeTitleFromText( $linkTarget );
	$categoryNs = $env->getSiteConfig()->canonicalNamespaceId( 'category' );
	$fileNs = $env->getSiteConfig()->canonicalNamespaceId( 'file' );

	if ( ( $linkTitle->getNamespace() === $categoryNs || $linkTitle->getNamespace() === $fileNs ) &&
		$linkData->type === 'mw:WikiLink' &&
		// str_starts_with() rather than $linkTarget[0] so an empty target
		// cannot trigger an "uninitialized string offset" warning.
		!str_starts_with( $linkTarget, ':' )
	) {
		// Escape category and file links
		return ':' . $linkTarget;
	}

	return $linkTarget;
}
470
/**
 * Test whether a link can be serialized as a bare URL (autolink).
 *
 * All of the following must hold:
 *  - the node has non-empty plain-text content;
 *  - that content equals either the link target value or the node's href;
 *  - the content does not start with '//';
 *  - Utils::isProtocolValid() accepts the content;
 *  - the content does not end in a character that would terminate an
 *    autourl.
 *
 * @param Env $env
 * @param Element $node
 * @param stdClass $linkData Link round-trip data; only ->target is read
 * @return bool
 */
private static function isURLLink( Env $env, Element $node, stdClass $linkData ): bool {
	$target = $linkData->target;

	// Get plain text content, if any
	$contentStr = self::getContentString( $node );

	// First check if we can serialize as an URL link
	return ( $contentStr !== null && $contentStr !== '' ) &&
		// Can we minimize this?
		( $target['value'] === $contentStr || self::getHref( $env, $node ) === $contentStr ) &&
		// protocol-relative url links not allowed in text
		// (see autourl rule in peg tokenizer, T32269)
		!str_starts_with( $contentStr, '//' ) && Utils::isProtocolValid( $contentStr, $env ) &&
		!self::hasAutoUrlTerminatingChars( $contentStr );
}
493
/**
 * Report whether a URL ends in a character that stops an autourl.
 *
 * The legacy parser (Parser.php::makeFreeExternalLink) ends an autourl when
 * it meets certain characters. To mimic that, the last byte of the URL is
 * looked up in the terminator set from TokenizerUtils, which is told
 * whether the URL contains a '('.
 *
 * @param string $url
 * @return bool
 */
private static function hasAutoUrlTerminatingChars( string $url ): bool {
	$hasOpenParen = str_contains( $url, '(' );
	$terminators = TokenizerUtils::getAutoUrlTerminatingChars( $hasOpenParen );
	$lastByte = substr( $url, -1 );
	return str_contains( $terminators, $lastByte );
}
505
/**
 * Decide whether a wikilink can be written in simple form ([[Target]])
 * rather than piped form ([[Target|content]]).
 *
 * A simple link is only considered when the link has plain-string content,
 * that content does not start with './', and either the target or content
 * was modified or the original syntax was not 'piped'. It is then accepted
 * if the content matches the target under any of the comparisons below
 * (literal, slash-wrapped, title-normalized, relative/subpage, or decoded
 * href).
 *
 * @param Env $env
 * @param DataParsoid $dp Only ->stx is read
 * @param array $target Shadowed target info; 'value' and 'modified' are read
 * @param stdClass $linkData Link round-trip data; ->content->string,
 *   ->contentModified, ->isInterwiki and ->href are read
 * @return bool
 */
private static function isSimpleWikiLink(
	Env $env, DataParsoid $dp, array $target, stdClass $linkData
): bool {
	$canUseSimple = false;
	$contentString = $linkData->content->string ?? null;

	// FIXME (SSS):
	// 1. Revisit this logic to see if all these checks
	// are still relevant or whether this can be simplified somehow.
	// 2. There are also duplicate computations for env.normalizedTitleKey(..)
	// and Util.decodeURIComponent(..) that could be removed.
	// 3. This could potentially be refactored as if-then chains.

	// Would need to pipe for any non-string content.
	// Preserve unmodified or non-minimal piped links.
	if ( $contentString !== null &&
		( !empty( $target['modified'] ) || !empty( $linkData->contentModified ) ||
			( $dp->stx ?? null ) !== 'piped'
		) &&
		// Relative links are not simple
		!str_starts_with( $contentString, './' )
	) {
		// Strip colon escapes from the original target as that is
		// stripped when deriving the content string.
		// Strip ./ prefixes as well since they are relative link prefixes
		// added to all titles.
		// The prefix stripping, when it occurs, also includes spaces before the prefix.
		// Finally, we also remove trailing spaces because these are removed for <a> links
		// by DOMNormalizer::moveTrailingSpacesOut, and we wouldn't want that to lead to the
		// link getting piped for only that reason.
		$strippedTargetValue = rtrim(
			preg_replace( '#^\s*(:|\./)#', '', $target['value'], 1 )
		);

		// Strip colon escape after prefix for interwikis
		if ( !empty( $linkData->isInterwiki ) ) {
			$strippedTargetValue = preg_replace( '#^(\w+:):#', '$1', $strippedTargetValue, 1 );
		}

		$decodedTarget = Utils::decodeWtEntities( $strippedTargetValue );
		// Deal with the protocol-relative link scenario as well
		$hrefHasProto = preg_match( '#^(\w+:)?//#', $linkData->href );

		// Normalize content string and decoded target before comparison.
		// Piped links don't come down this path => it is safe to normalize both.
		$contentString = str_replace( '_', ' ', $contentString );
		$decodedTarget = str_replace( '_', ' ', $decodedTarget );

		// See if the (normalized) content matches the
		// target, either shadowed or actual.
		$canUseSimple =
			$contentString === $decodedTarget ||
			// try wrapped in forward slashes in case they were stripped
			( '/' . $contentString . '/' ) === $decodedTarget ||
			// normalize as titles and compare
			// FIXME: This will strip an interwiki prefix. Is that right?
			$env->normalizedTitleKey( $contentString, true )
				=== preg_replace( self::$MW_TITLE_WHITESPACE_RE, '_', $decodedTarget ) ||
			// Relative link
			(
				(
					$env->getSiteConfig()->namespaceHasSubpages(
						$env->getContextTitle()->getNamespace()
					) &&
					preg_match( '#^\.\./.*[^/]$#D', $strippedTargetValue ) &&
					$contentString === $env->resolveTitle( $strippedTargetValue )
				) ||
				(
					preg_match( '#^\.\./.*?/$#D', $strippedTargetValue ) &&
					$contentString === preg_replace( '#^(?:\.\./)+(.*?)/$#D', '$1', $strippedTargetValue, 1 )
				)
			) ||
			// if content == href this could be a simple link... eg [[Foo]].
			// but if href is an absolute url with protocol, this won't
			// work: [[http://example.com]] is not a valid simple link!
			(
				!$hrefHasProto &&
				// Always compare against decoded uri because
				// <a rel="mw:WikiLink" href="7%25 Solution">7%25 Solution</a></p>
				// should serialize as [[7% Solution|7%25 Solution]]
				(
					$contentString === Utils::decodeURIComponent( $linkData->href ) ||
					// normalize with underscores for comparison with href
					$env->normalizedTitleKey( $contentString, true )
						=== Utils::decodeURIComponent( $linkData->href )
				)
			);
	}

	return $canUseSimple;
}
605
606	/**
607	* Serialize as wiki link
608	* @param Element $node
609	* @param SerializerState $state
610	* @param stdClass $linkData
611	*/
612	private static function serializeAsWikiLink(
613	Element $node, SerializerState $state, stdClass $linkData
614	): void {
615	$contentParts = null;
616	$contentSrc = '';
617	$isPiped = false;
618	$needsEscaping = true;
619	$env = $state->getEnv();
620	$siteConfig = $env->getSiteConfig();
621	$target = $linkData->target;
622	$dp = DOMDataUtils::getDataParsoid( $node );
623
624	// Decode any link that did not come from the source (data-mw/parsoid)
625	// Links that come from data-mw/data-parsoid will be true titles,
626	// but links that come from hrefs will need to be url-decoded.
627	// Ex: <a href="/wiki/A%3Fb">Foobar</a>
628	if ( empty( $target['fromsrc'] ) ) {
629	// Omit fragments from decoding
630	$hash = strpos( $target['value'], '#' );
631	if ( $hash !== false ) {
632	$target['value'] = Utils::decodeURIComponent( substr( $target['value'], 0, $hash ) )
633	. substr( $target['value'], $hash );
634	} else {
635	$target['value'] = Utils::decodeURIComponent( $target['value'] );
636	}
637	}
638
639	// Special-case handling for category links
640	if ( $linkData->type === 'mw:PageProp/Category' ) {
641	// Split target and sort key in $target['value'].
642	// The sort key shows up as "#something" in there.
643	// However, watch out for parser functions that start with "{{#"
644	// The atomic group is essential to prevent "{{#" parser function prefix
645	// from getting split at the "{{" and "#" where the "{{" matches the
646	// [^#]* and the "#" matches after separately.
647	if ( preg_match( '/^((?>{{#\|[^#]))#(.)/', $target['value'], $targetParts ) ) {
648	$target['value'] = strtr( preg_replace( '#^(\.\.?/)*#', '', $targetParts[1], 1 ), '_', ' ' );
649	// FIXME: Reverse `Sanitizer.sanitizeTitleURI(strContent).replace(/#/g, '%23');`
650	$strContent = Utils::decodeURIComponent( $targetParts[2] );
651	$contentParts = self::splitLinkContentString( $strContent, $dp );
652	$linkData->content->string = $contentParts->contentString;
653	$dp->tail = $linkData->tail = $contentParts->tail;
654	$dp->prefix = $linkData->prefix = $contentParts->prefix;
655	} else { // No sort key, will serialize to simple link
656	// Normalize the content string
657	$linkData->content->string = strtr(
658	PHPUtils::stripPrefix( $target['value'], './' ), '_', ' '
659	);
660	}
661
662	// Special-case handling for template-affected sort keys
663	// FIXME: sort keys cannot be modified yet, but if they are,
664	// we need to fully shadow the sort key.
665	// if ( !target.modified ) {
666	// The target and source key was not modified
667	$sortKeySrc = $state->serializer->serializedAttrVal( $node, 'mw:sortKey' );
668	if ( isset( $sortKeySrc['value'] ) ) {
669	$linkData->contentNode = null;
670	$linkData->content->string = $sortKeySrc['value'];
671	// TODO: generalize this flag. It is already used by
672	// getAttributeShadowInfo. Maybe use the same
673	// structure as its return value?
674	$linkData->content->fromsrc = true;
675	}
676	// }
677	} else {
678	if ( $linkData->type === 'mw:PageProp/Language' ) {
679	// Fix up the content string
680	// TODO: see if linkData can be cleaner!
681	$linkData->content->string ??= Utils::decodeWtEntities( $target['value'] );
682	}
683	}
684
685	// The string value of the content, if it is plain text.
686	$linkTarget = null;
687	$escapedTgt = null;
688	if ( !empty( $linkData->isRedirect ) ) {
689	$linkTarget = $target['value'];
690	if ( !empty( $target['modified'] ) \|\| empty( $target['fromsrc'] ) ) {
691	$linkTarget = strtr( preg_replace( '#^(\.\.?/)*#', '', $linkTarget, 1 ), '_', ' ' );
692	$escapedTgt = self::escapeLinkTarget( $linkTarget, $state );
693	$linkTarget = $escapedTgt->linkTarget;
694	// Determine if it's a redirect to a category, in which case
695	// it needs a ':' on front to distingish from a category link.
696	if ( preg_match( '/^([^:]+)[:]/', $linkTarget, $categoryMatch ) ) {
697	$ns = $siteConfig->namespaceId( Utils::normalizeNamespaceName( $categoryMatch[1] ) );
698	if ( $ns === $siteConfig->canonicalNamespaceId( 'category' ) ) {
699	// Check that the next node isn't a category link,
700	// in which case we don't want the ':'.
701	$nextNode = $node->nextSibling;
702	if ( !(
703	$nextNode instanceof Element && DOMCompat::nodeName( $nextNode ) === 'link' &&
704	DOMUtils::hasRel( $nextNode, 'mw:PageProp/Category' ) &&
705	DOMCompat::getAttribute( $nextNode, 'href' ) === DOMCompat::getAttribute( $node, 'href' )
706	) ) {
707	$linkTarget = ':' . $linkTarget;
708	}
709	}
710	}
711	}
712	} elseif ( self::isSimpleWikiLink( $env, $dp, $target, $linkData ) ) {
713	// Simple case
714	if ( empty( $target['modified'] ) && empty( $linkData->contentModified ) ) {
715	$linkTarget = PHPUtils::stripPrefix( $target['value'], './' );
716	} else {
717	// If token has templated attrs or is a subpage, use target.value
718	// since content string will be drastically different.
719	if ( WTUtils::hasExpandedAttrsType( $node ) \|\|
720	preg_match( '#(^\|/)\.\./#', $target['value'] )
721	) {
722	$linkTarget = PHPUtils::stripPrefix( $target['value'], './' );
723	} else {
724	$escapedTgt = self::escapeLinkTarget( $linkData->content->string, $state );
725	if ( !$escapedTgt->invalidLink ) {
726	$linkTarget = self::addColonEscape( $env, $escapedTgt->linkTarget, $linkData );
727	} else {
728	$linkTarget = $escapedTgt->linkTarget;
729	}
730	}
731	if ( !empty( $linkData->isInterwikiLang ) &&
732	$linkTarget[0] !== ':' &&
733	$linkData->type !== 'mw:PageProp/Language'
734	) {
735	// ensure interwiki links can't be confused with
736	// interlanguage links.
737	$linkTarget = ':' . $linkTarget;
738	}
739	}
740	} elseif ( self::isURLLink( $state->getEnv(), $node, $linkData )
741	/* && empty( $linkData->isInterwiki ) */
742	) {
743	// Uncomment the above check if we want [[wikipedia:Foo\|http://en.wikipedia.org/wiki/Foo]]
744	// for '<a href="http://en.wikipedia.org/wiki/Foo">http://en.wikipedia.org/wiki/Foo</a>'
745	$linkData->linkType = 'mw:URLLink';
746	} else {
747	// Emit piped wikilink syntax
748	$isPiped = true;
749
750	// First get the content source
751	if ( !empty( $linkData->contentNode ) ) {
752	$cs = $state->serializeLinkChildrenToString(
753	$linkData->contentNode,
754	[ $state->serializer->wteHandlers, 'wikilinkHandler' ]
755	);
756	// strip off the tail and handle the pipe trick
757	$contentParts = self::splitLinkContentString( $cs, $dp );
758	$contentSrc = $contentParts->contentString;
759	$dp->tail = $contentParts->tail;
760	$linkData->tail = $contentParts->tail;
761	$dp->prefix = $contentParts->prefix;
762	$linkData->prefix = $contentParts->prefix;
763	$needsEscaping = false;
764	} else {
765	$contentSrc = $linkData->content->string ?? '';
766	$needsEscaping = empty( $linkData->content->fromsrc );
767	}
768
769	if ( $contentSrc === '' && $linkData->type !== 'mw:PageProp/Category' ) {
770	// Protect empty link content from PST pipe trick
771	$contentSrc = '<nowiki/>';
772	$needsEscaping = false;
773	}
774
775	$linkTarget = $target['value'];
776	if ( !empty( $target['modified'] ) \|\| empty( $target['fromsrc'] ) ) {
777	// Links starting with ./ shouldn't get _ replaced with ' '
778	$linkContentIsRelative = str_starts_with( $linkData->content->string ?? '', './' );
779	$linkTarget = preg_replace( '#^(\.\.?/)*#', '', $linkTarget, 1 );
780	if ( empty( $linkData->isInterwiki ) && !$linkContentIsRelative ) {
781	$linkTarget = strtr( $linkTarget, '_', ' ' );
782	}
783	$escapedTgt = self::escapeLinkTarget( $linkTarget, $state );
784	$linkTarget = $escapedTgt->linkTarget;
785	}
786
787	// If we are reusing the target from source, we don't
788	// need to worry about colon-escaping because it will
789	// be in the right form already.
790	//
791	// Trying to eliminate this check and always check for
792	// colon-escaping seems a bit tricky when the reused
793	// target has encoded entities that won't resolve to
794	// valid titles.
795	if ( ( !$escapedTgt \|\| !$escapedTgt->invalidLink ) && empty( $target['fromsrc'] ) ) {
796	$linkTarget = self::addColonEscape( $env, $linkTarget, $linkData );
797	}
798	}
799	if ( $linkData->linkType === 'mw:URLLink' ) {
800	$state->emitChunk( new AutoURLLinkText( $node->textContent, $node ), $node );
801	return;
802	}
803
804	if ( !empty( $linkData->isRedirect ) ) {
805	// Drop duplicates
806	if ( $state->redirectText !== null ) {
807	return;
808	}
809
810	// Buffer redirect text if it is not in start of file position
811	if ( !preg_match( self::$REDIRECT_TEST_RE, $state->out . $state->currLine->text ) ) {
812	$state->redirectText = $linkData->prefix . '[[' . $linkTarget . ']]';
813	$state->emitChunk( '', $node ); // Flush separators for this node
814	// Flush separators for this node
815	return;
816	}
817
818	// Set to some non-null string
819	$state->redirectText = 'unbuffered';
820	}
821
822	$pipedText = null;
823	if ( $escapedTgt && $escapedTgt->invalidLink ) {
824	// If the link target was invalid, instead of emitting an invalid link,
825	// omit the link and serialize just the content instead. But, log the
826	// invalid html for Parsoid clients to investigate later.
827	$state->getEnv()->log(
828	'error/html2wt/link', 'Bad title text', DOMCompat::getOuterHTML( $node )
829	);
830
831	// For non-piped content, use the original invalid link text
832	$pipedText = $isPiped ? $contentSrc : $linkTarget;
833	$state->needsEscaping = $needsEscaping;
834	$state->emitChunk( $linkData->prefix . $pipedText . $linkData->tail, $node );
835	} else {
836	if ( $isPiped && $needsEscaping ) {
837	// We are definitely not in sol context since content
838	// will be preceded by "[[" or "[" text in target wikitext.
839	$pipedText = '\|' . $state->serializer->wteHandlers
840	->escapeLinkContent( $state, $contentSrc, false, $node, false );
841	} elseif ( $isPiped ) {
842	$pipedText = '\|' . $contentSrc;
843	} else {
844	$pipedText = '';
845	}
846	if ( $isPiped ) {
847	$state->singleLineContext->disable();
848	}
849	$state->emitChunk( new WikiLinkText(
850	$linkData->prefix . '[[' . $linkTarget . $pipedText . ']]' . $linkData->tail,
851	$node, $siteConfig, $linkData->type
852	), $node );
853	if ( $isPiped ) {
854	$state->singleLineContext->pop();
855	}
856	}
857	}
858
/**
 * Serialize a link node using external-link style syntax.
 *
 * Depending on the link data this emits one of:
 *  - a bare auto-linked URL (when isURLLink() says so),
 *  - a magic link (ISBN/RFC/PMID) or its bracketed fallback,
 *  - a wikilink `[[#frag|text]]` when the target is a pure fragment,
 *  - a bracketed external link `[url text]` (auto-numbered `[url]` when
 *    the serialized content is empty).
 *
 * @param Element $node
 * @param SerializerState $state
 * @param stdClass $linkData
 */
private static function serializeAsExtLink(
	Element $node, SerializerState $state, stdClass $linkData
): void {
	$target = $linkData->target;
	$urlStr = $target['value'];
	if ( !empty( $target['modified'] ) || empty( $target['fromsrc'] ) ) {
		// We expect modified hrefs to be percent-encoded already, so
		// don't need to encode them here any more. Unmodified hrefs are
		// just using the original encoding anyway.
		// BUT we do have to encode certain special wikitext
		// characters (like []) which aren't necessarily
		// percent-encoded because they are valid in URLs and HTML5
		$urlStr = self::escapeExtLinkURL( $urlStr );
	}

	if ( self::isURLLink( $state->getEnv(), $node, $linkData ) ) {
		// Serialize as URL link
		$state->emitChunk( new AutoURLLinkText( $urlStr, $node ), $node );
		return;
	}

	$siteConfig = $state->getEnv()->getSiteConfig();

	// TODO: match vs. interwikis too
	$magicLinkMatch = $siteConfig->getExtResourceURLPatternMatcher()(
		Utils::decodeURI( $linkData->origHref )
	);
	$pureHashMatch = str_starts_with( $urlStr, '#' );
	// Fully serialize the content
	$contentStr = $state->serializeLinkChildrenToString(
		$node,
		[ $state->serializer->wteHandlers, $pureHashMatch ? 'wikilinkHandler' : 'aHandler' ]
	);

	// First check for ISBN/RFC/PMID links. We rely on selser to
	// preserve non-minimal forms.
	if ( $magicLinkMatch ) {
		$serialized = $siteConfig->makeExtResourceURL(
			$magicLinkMatch, $target['value'], $contentStr
		);
		// str_starts_with() rather than $serialized[0] so that an empty
		// result does not trigger an out-of-range string offset access.
		if ( str_starts_with( $serialized, '[' ) ) {
			// Serialization as a magic link failed (perhaps the
			// content string wasn't appropriate).
			$state->emitChunk(
				( $magicLinkMatch[0] === 'ISBN' ) ?
					new WikiLinkText( $serialized, $node, $siteConfig, 'mw:WikiLink' ) :
					new ExtLinkText( $serialized, $node, $siteConfig, 'mw:ExtLink' ),
				$node
			);
		} else {
			$state->emitChunk( new MagicLinkText( $serialized, $node ), $node );
		}
		return;
	}

	// Compare against the empty string explicitly: a plain truthiness
	// test would treat the link text "0" as missing and silently drop it.
	$hasContent = ( $contentStr ?? '' ) !== '';

	if ( $pureHashMatch ) {
		// If it's just anchor text, serialize as an internal link.
		$class = WikiLinkText::class;
		$linktext = '[[' . $urlStr . ( $hasContent ? '|' . $contentStr : '' ) . ']]';
	} else {
		// Without content this is an auto-numbered external link:
		// [http://example.com]
		$class = ExtLinkText::class;
		$linktext = '[' . $urlStr . ( $hasContent ? ' ' . $contentStr : '' ) . ']';
	}
	$state->emitChunk( new $class( $linktext, $node, $siteConfig, $linkData->type ), $node );
}
934
/**
 * Main link handler.
 *
 * Collects the round-trip data for the link and dispatches to the
 * wikilink or external-link serializer based on its type. Links that
 * carry no usable type/target information are serialized as a basic
 * html figure (<a><img></a>), as a bracketed external link, or, when
 * there is no href at all, as plain text (or a named span).
 *
 * @param SerializerState $state
 * @param Element $node
 * @throws UnexpectedValueException when a typed link with a target is
 *   neither a wikilink-like type nor an external link
 */
public static function linkHandler( SerializerState $state, Element $node ): void {
	// TODO: handle internal/external links etc using RDFa and dataParsoid
	// Also convert unannotated html links without advanced attributes to
	// external wiki links for html import. Might want to consider converting
	// relative links without path component and file extension to wiki links.
	$env = $state->getEnv();
	$siteConfig = $env->getSiteConfig();

	// Get the rt data from the token and tplAttrs
	$linkData = self::getLinkRoundTripData( $env, $node, $state );

	// A magic link overrides whatever 'rel' type the link came with
	$isMagicLink = $siteConfig->getExtResourceURLPatternMatcher()(
		Utils::decodeURI( $linkData->origHref )
	);
	$linkType = $isMagicLink ? 'mw:ExtLink' : $linkData->type;

	if ( $linkType !== null && isset( $linkData->target['value'] ) ) {
		// We have a type and target info
		$isWikiLinkLike = in_array( $linkType, [ 'mw:WikiLink', 'mw:MediaLink' ], true )
			|| preg_match( TokenUtils::SOL_TRANSPARENT_LINK_REGEX, $linkType );

		if ( $isWikiLinkLike ) {
			// [[..]] links: normal, category, redirect, or lang links
			// (except images)
			self::serializeAsWikiLink( $node, $state, $linkData );
			return;
		}

		if ( $linkType !== 'mw:ExtLink' ) {
			throw new UnexpectedValueException(
				'Unhandled link serialization scenario: ' . DOMCompat::getOuterHTML( $node )
			);
		}

		// [..] links, autolinks, ISBN, RFC, PMID
		self::serializeAsExtLink( $node, $state, $linkData );
		return;
	}

	// Attributes that do not, by themselves, make this a "complex" link.
	$allowedAttrs = [
		'href' => true,
		'rel' => true,
		'class' => true,
		'title' => true,
		DOMDataUtils::DATA_OBJECT_ATTR_NAME => true
	];

	$hasUnsupportedAttr = false;
	foreach ( DOMUtils::attributes( $node ) as $attrName => $attrValue ) {
		// XXX: Don't drop rel and class in every case once a tags are
		// actually supported in the MW default config?
		if ( !isset( $allowedAttrs[$attrName] ) ) {
			$hasUnsupportedAttr = true;
			break;
		}
	}

	if ( $hasUnsupportedAttr ) {
		$env->log( 'error/html2wt/link', 'Encountered', DOMCompat::getOuterHTML( $node ),
			'-- serializing as extlink and dropping <a> attributes unsupported in wikitext.'
		);
	} else {
		$media = DOMUtils::selectMediaElt( $node ); // TODO: Handle missing media too
		if ( $media instanceof Element && $media->parentNode === $node ) {
			// this is a basic html figure: <a><img></a>
			self::figureHandler( $state, $node, new MediaStructure( $media, $node ) );
			return;
		}
	}

	// href is already percent-encoded, etc., but it might contain
	// spaces or other wikitext nasties. escape the nasties.
	$hrefStr = self::escapeExtLinkURL( self::getHref( $env, $node ) );
	$str = $state->serializeLinkChildrenToString(
		$node, [ $state->serializer->wteHandlers, 'aHandler' ]
	);

	if ( $hrefStr ) {
		$chunk = new ExtLinkText( '[' . $hrefStr . ' ' . $str . ']',
			$node, $siteConfig, 'mw:ExtLink'
		);
	} else {
		// Without an href, we just emit the string as text.
		// However, to preserve targets for anchor links,
		// serialize as a span with a name.
		$name = DOMCompat::getAttribute( $node, 'name' );
		if ( $name === null ) {
			$chunk = $str;
		} else {
			$doc = $node->ownerDocument;
			$span = $doc->createElement( 'span' );
			$span->setAttribute( 'name', $name );
			$span->appendChild( $doc->createTextNode( $str ) );
			$chunk = DOMCompat::getOuterHTML( $span );
		}
	}
	$state->emitChunk( $chunk, $node );
}
1034
/**
 * Main figure handler.
 *
 * Emits the wikitext for a parsed media structure. When no media
 * structure is available, an error is logged and nothing is emitted.
 *
 * @param SerializerState $state
 * @param Element $node
 * @param ?MediaStructure $ms
 */
public static function figureHandler(
	SerializerState $state, Element $node, ?MediaStructure $ms
): void {
	if ( $ms !== null ) {
		// A null result from figureToConstrainedText is emitted as ''
		$state->emitChunk( self::figureToConstrainedText( $state, $ms ) ?? '', $node );
		return;
	}

	$state->getEnv()->log(
		'error/html2wt/figure',
		"Couldn't parse media structure: ",
		DOMCompat::getOuterHTML( $node )
	);
}
1056
/**
 * Serialize a figure to constrained text.
 *
 * Builds the `[[File:...|opt|opt|caption]]` wikitext for a media structure
 * by collecting media options from HTML attributes, classes, data-mw and
 * the original option list recorded in data-parsoid, then ordering them
 * to match the original wikitext where possible.
 *
 * WARN: There's probably more to do to ensure this is purely functional,
 * no side-effects (ie. calls to state->emit) happen while processing.
 * Note that the media element's `alt` attribute is removed here when it
 * merely duplicates the caption text.
 *
 * @param SerializerState $state
 * @param MediaStructure $ms
 * @return ?ConstrainedText Null when the media element has neither a
 *   resource nor a src to serialize from (an error is logged).
 */
public static function figureToConstrainedText(
	SerializerState $state, MediaStructure $ms
): ?ConstrainedText {
	$env = $state->getEnv();
	$outerElt = $ms->containerElt ?? $ms->mediaElt;
	$linkElt = $ms->linkElt;
	$elt = $ms->mediaElt;
	$captionElt = $ms->captionElt;
	$format = WTUtils::getMediaFormat( $outerElt );

	// Try to identify the local title to use for this image.
	$resource = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'resource' );
	if ( !isset( $resource['value'] ) ) {
		// from non-parsoid HTML: try to reconstruct resource from src?
		// (this won't work for manual-thumb images)
		$src = DOMCompat::getAttribute( $elt, 'src' );
		if ( $src === null ) {
			$env->log( 'error/html2wt/figure',
				'In WSP.figureHandler, img does not have resource or src:',
				DOMCompat::getOuterHTML( $outerElt )
			);
			return null;
		}
		if ( preg_match( '/^https?:/', $src ) ) {
			// external image link, presumably $wgAllowExternalImages=true
			return new AutoURLLinkText( $src, $outerElt );
		}
		$resource = [
			'value' => $src,
			'fromsrc' => false,
			'modified' => false
		];
	}
	if ( empty( $resource['fromsrc'] ) ) {
		$resource['value'] = preg_replace( '#^(\.\.?/)+#', '', $resource['value'], 1 );
	}

	$nopts = [];
	$outerDP = DOMDataUtils::getDataParsoid( $outerElt );
	$outerDMW = DOMDataUtils::getDataMw( $outerElt );
	$mwAliases = $state->getEnv()->getSiteConfig()->mwAliases();

	// Find the first original option with the given canonical key.
	// The result is a copy: callers here only read it.
	$getOpt = static function ( string $key ) use ( $outerDP ): ?array {
		if ( empty( $outerDP->optList ) ) {
			return null;
		}
		foreach ( $outerDP->optList as $opt ) {
			if ( ( $opt['ck'] ?? null ) === $key ) {
				return $opt;
			}
		}
		return null;
	};
	// Find the last original option with the given canonical key.
	// The result is a copy: callers here only read it.
	$getLastOpt = static function ( string $key ) use ( $outerDP ): ?array {
		$opts = $outerDP->optList ?? [];
		for ( $i = count( $opts ) - 1; $i >= 0; $i-- ) {
			if ( ( $opts[$i]['ck'] ?? null ) === $key ) {
				return $opts[$i];
			}
		}
		return null;
	};

	// Try to identify the local title to use for the link.
	$link = null;

	$linkFromDataMw = WTSUtils::getAttrFromDataMw( $outerDMW, 'link', true );
	if ( $linkFromDataMw !== null ) {
		// "link" attribute on the `outerElt` takes precedence
		if ( isset( $linkFromDataMw[1]->html ) ) {
			$link = $state->serializer->getAttributeValueAsShadowInfo( $outerElt, 'link' );
		} else {
			$link = [
				'value' => "link={$linkFromDataMw[1]->txt}",
				'modified' => false,
				'fromsrc' => false,
				'fromDataMW' => true
			];
		}
	} elseif ( $linkElt && $linkElt->hasAttribute( 'href' ) ) {
		$link = $state->serializer->serializedImageAttrVal( $outerElt, $linkElt, 'href' );
		if ( empty( $link['fromsrc'] ) ) {
			// strip page or lang parameter if present on href
			$strippedHref = preg_replace(
				'#[?]((?:page=\d+)|(?:lang=[a-z]+(?:-[a-z]+)*))$#Di',
				'',
				DOMCompat::getAttribute( $linkElt, 'href' ) ?? ''
			);
			if ( $strippedHref === DOMCompat::getAttribute( $elt, 'resource' ) ) {
				// default link: same place as resource
				$link = $resource;
			}
			$link['value'] = preg_replace( '#^(\.\.?/)+#', '', $link['value'], 1 );
		}
	} else {
		// Otherwise, just try and get it from data-mw
		$link = $state->serializer->getAttributeValueAsShadowInfo( $outerElt, 'href' );
	}

	if ( $link && empty( $link['modified'] ) && empty( $link['fromsrc'] ) ) {
		$linkOpt = $getOpt( 'link' );
		if ( $linkOpt ) {
			$link['fromsrc'] = true;
			$link['value'] = $linkOpt['ak'];
		}
	}

	// Reconstruct the caption
	if ( !$captionElt && is_string( $outerDMW->caption ?? null ) ) {
		// IMPORTANT: Assign to a variable to prevent the fragment
		// from getting GCed before we are done with it.
		$fragment = ContentUtils::createAndLoadDocumentFragment(
			$outerElt->ownerDocument, $outerDMW->caption,
			[ 'markNew' => true ]
		);
		// FIXME: We should just be able to serialize the children of the
		// fragment, however, we need some way of marking this as being
		// inInsertedContent so that any bare text is assured to be escaped
		$captionElt = $outerElt->ownerDocument->createElement( 'div' );
		DOMDataUtils::getDataParsoid( $captionElt )->setTempFlag( TempData::IS_NEW );
		DOMUtils::migrateChildren( $fragment, $captionElt );
		// Needs a parent node in order for WTS to be happy
		$fragment->appendChild( $captionElt );
	}

	$caption = null;
	if ( $captionElt ) {
		$caption = $state->serializeCaptionChildrenToString(
			$captionElt, [ $state->serializer->wteHandlers, 'mediaOptionHandler' ]
		);

		// Alt stuff
		if ( !WTUtils::hasVisibleCaption( $outerElt ) && $elt->hasAttribute( 'alt' ) ) {
			$altOnElt = trim( DOMCompat::getAttribute( $elt, 'alt' ) ?? '' );
			$altFromCaption = trim( WTUtils::textContentFromCaption( $captionElt ) );
			// The first condition is to support an empty "alt=" option
			// when no caption is present
			if ( $altOnElt && ( $altOnElt === $altFromCaption ) ) {
				$elt->removeAttribute( 'alt' );
			}
		}
	}

	// Fetch the alt (if any)
	$alt = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'alt' );
	// Fetch the lang (if any)
	$lang = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'lang' );
	// Fetch the muted (if any)
	$muted = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'muted' );
	// Fetch the loop (if any)
	$loop = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'loop' );

	// Ok, start assembling options, beginning with link & alt & lang
	// Other media don't have links in output.
	$linkCond = DOMCompat::nodeName( $elt ) === 'img';
	if ( $linkCond && $link ) {
		// Check whether the link goes to the default place, in which
		// case an explicit link tag isn't needed.
		// The link may be external, or may include wikitext template markup,
		// therefore check first that it parses to a title.
		$linkTitle = $env->normalizedTitleKey(
			Utils::decodeURIComponent( $link['value'] ), true
		);
		$resourceTitle = $env->normalizedTitleKey(
			Utils::decodeURIComponent( $resource['value'] ), true
		);
		if (
			$link['value'] === $resource['value'] ||
			( $linkTitle !== null && $linkTitle === $resourceTitle )
		) {
			$linkCond = false; // No explicit link attribute needed
		}
	}

	// "alt" for non-image is handled below.
	// isset() matches the checks used for lang/muted/loop and does not
	// warn if the shadow info carries no 'value' key.
	$altCond = isset( $alt['value'] ) && DOMCompat::nodeName( $elt ) === 'img';

	// This loop handles media options which mostly correspond 1-1 with
	// HTML attributes. `img_$name` is the name of the media option,
	// and $value is the Parsoid "shadow info" for the attribute.
	// $cond tells us whether we need to explicitly output this option;
	// if it is false we are using an implicit default.
	// `lang` and `alt` are fairly straightforward. `link`
	// is a little trickier, since we need to massage/fake the shadow
	// info because it doesn't come directly from the attribute.
	// link comes from the combination of a[href], img[src], and
	// img[resource], etc;
	foreach ( [
		[ 'name' => 'link', 'value' => $link, 'cond' => $linkCond, 'alias' => 'img_link' ],
		[ 'name' => 'alt', 'value' => $alt, 'cond' => $altCond, 'alias' => 'img_alt' ],
		[ 'name' => 'lang', 'value' => $lang, 'cond' => isset( $lang['value'] ), 'alias' => 'img_lang' ],
		[ 'name' => 'muted', 'value' => $muted, 'cond' => isset( $muted['value'] ), 'alias' => 'timedmedia_muted' ],
		[ 'name' => 'loop', 'value' => $loop, 'cond' => isset( $loop['value'] ), 'alias' => 'timedmedia_loop' ],
	] as $o ) {
		if ( !$o['cond'] ) {
			continue;
		}
		if ( $o['value'] && !empty( $o['value']['fromsrc'] ) ) {
			$nopts[] = [
				'ck' => $o['name'],
				'ak' => [ $o['value']['value'] ],
			];
		} else {
			$value = $o['value'] ? $o['value']['value'] : '';
			if ( $o['value'] && in_array( $o['name'], [ 'link', 'alt' ], true ) ) {
				// see WikiLinkHandler::isWikitextOpt(): link and alt are allowed
				// to contain arbitrary wikitext, even though it is stripped
				// to a string before emitting.
				$value = $state->serializer->wteHandlers->escapeLinkContent(
					$state, $value, false, $outerElt, true
				);
			}
			$nopts[] = [
				'ck' => $o['name'],
				'v' => $value,
				'ak' => $mwAliases[$o['alias']],
			];
		}
	}

	// Now we handle media options which all come from space-separated
	// values in a single HTML attribute, `class`. (But note that there
	// can also be "extra" classes added by `img_class` as well.)
	$classes = DOMCompat::getClassList( $outerElt );
	$extra = []; // 'extra' classes

	foreach ( $classes as $c ) {
		switch ( $c ) {
			case 'mw-halign-none':
			case 'mw-halign-right':
			case 'mw-halign-left':
			case 'mw-halign-center':
				$val = substr( $c, 10 ); // strip mw-halign- prefix
				$nopts[] = [
					'ck' => $val,
					'ak' => $mwAliases['img_' . $val],
				];
				break;

			case 'mw-valign-top':
			case 'mw-valign-middle':
			case 'mw-valign-baseline':
			case 'mw-valign-sub':
			case 'mw-valign-super':
			case 'mw-valign-text-top':
			case 'mw-valign-bottom':
			case 'mw-valign-text-bottom':
				$val = strtr( substr( $c, 10 ), '-', '_' ); // strip mw-valign and '-' to '_'
				$nopts[] = [
					'ck' => $val,
					'ak' => $mwAliases['img_' . $val],
				];
				break;

			case 'mw-image-border':
				$nopts[] = [
					'ck' => 'border',
					'ak' => $mwAliases['img_border'],
				];
				break;

			case 'mw-default-size':
			case 'mw-default-audio-height':
				// handled below
				break;

			default:
				$extra[] = $c;
				break;
		}
	}

	if ( count( $extra ) ) {
		$nopts[] = [
			'ck' => 'class',
			'v' => implode( ' ', $extra ),
			'ak' => $mwAliases['img_class'],
		];
	}

	// Now we handle parameters which don't have a representation
	// as HTML attributes; they are set only from the data-mw
	// values. (In theory they could perhaps be reverse engineered
	// from the thumbnail URL, but that would be fragile and expose
	// thumbnail implementation to the editor so we don't do that.)
	$mwParams = [
		[ 'prop' => 'thumb', 'ck' => 'manualthumb', 'alias' => 'img_manualthumb' ],
		[ 'prop' => 'page', 'ck' => 'page', 'alias' => 'img_page' ],
		// Video specific
		[ 'prop' => 'starttime', 'ck' => 'starttime', 'alias' => 'timedmedia_starttime' ],
		[ 'prop' => 'endtime', 'ck' => 'endtime', 'alias' => 'timedmedia_endtime' ],
		[ 'prop' => 'thumbtime', 'ck' => 'thumbtime', 'alias' => 'timedmedia_thumbtime' ]
	];

	// `img_link` and `img_alt` are only surfaced as HTML attributes
	// for image media. For all other media we treat them as set only
	// from data-mw.
	if ( DOMCompat::nodeName( $elt ) !== 'img' ) {
		$mwParams[] = [ 'prop' => 'link', 'ck' => 'link', 'alias' => 'img_link' ];
		$mwParams[] = [ 'prop' => 'alt', 'ck' => 'alt', 'alias' => 'img_alt' ];
	}

	$hasManualthumb = false;
	foreach ( $mwParams as $o ) {
		$v = $outerDMW->{$o['prop']} ?? null;
		if ( $v === null ) {
			$a = WTSUtils::getAttrFromDataMw( $outerDMW, $o['ck'], true );
			if ( $a !== null ) {
				if ( isset( $a[1]->html ) ) {
					$si = $state->serializer->getAttributeValueAsShadowInfo( $outerElt, $o['ck'] );
					if ( isset( $si['value'] ) ) {
						$nopts[] = [
							'ck' => $o['ck'],
							'ak' => [ $si['value'] ],
						];
						continue;
					}
				} else {
					$v = $a[1]->txt;
				}
			}
		}
		if ( $v !== null ) {
			$ak = $state->serializer->getAttributeValue(
				$outerElt, $o['ck']
			) ?? $mwAliases[$o['alias']];
			$nopts[] = [
				'ck' => $o['ck'],
				'ak' => $ak,
				'v' => $v
			];
			// Piggyback this here ...
			if ( $o['prop'] === 'thumb' ) {
				$hasManualthumb = true;
				$format = '';
			}
		}
	}

	// These media options come from the HTML `typeof` attribute.
	switch ( $format ) {
		case 'Thumb':
			$nopts[] = [
				'ck' => 'thumbnail',
				'ak' => $state->serializer->getAttributeValue(
					$outerElt, 'thumbnail'
				) ?? $mwAliases['img_thumbnail'],
			];
			break;
		case 'Frame':
			$nopts[] = [
				'ck' => 'framed',
				'ak' => $state->serializer->getAttributeValue(
					$outerElt, 'framed'
				) ?? $mwAliases['img_framed'],
			];
			break;
		case 'Frameless':
			$nopts[] = [
				'ck' => 'frameless',
				'ak' => $state->serializer->getAttributeValue(
					$outerElt, 'frameless'
				) ?? $mwAliases['img_frameless'],
			];
			break;
	}

	// Now handle the size-related options. This is complicated!
	// We consider the `height`, `data-height`, `width`, and
	// `data-width` attributes, as well as the `typeof` and the `class`.

	// Get the user-specified height from wikitext
	$wh = $state->serializer->serializedImageAttrVal(
		$outerElt, $elt, $ms->isRedLink() ? 'data-height' : 'height'
	);
	// Get the user-specified width from wikitext
	$ww = $state->serializer->serializedImageAttrVal(
		$outerElt, $elt, $ms->isRedLink() ? 'data-width' : 'width'
	);

	$sizeUnmodified = !empty( $ww['fromDataMW'] ) ||
		( empty( $ww['modified'] ) && empty( $wh['modified'] ) );
	$upright = $getOpt( 'upright' );

	// XXX: Infer upright factor from default size for all thumbs by default?
	// Better for scaling with user prefs, but requires knowledge about
	// default used in VE.
	if ( $sizeUnmodified && $upright &&
		// Only serialize upright where it is actually respected
		// This causes some dirty diffs, but makes sure that we don't
		// produce nonsensical output after a type switch.
		// TODO: Only strip if type was actually modified.
		in_array( $format, [ 'Frameless', 'Thumb' ], true )
	) {
		// preserve upright option
		$nopts[] = [
			'ck' => $upright['ck'],
			'ak' => [ $upright['ak'] ], // FIXME: don't use ak here!
		];
	}

	if (
		!DOMUtils::hasClass( $outerElt, 'mw-default-size' ) &&
		$format !== 'Frame' && !$hasManualthumb
	) {
		$size = $getLastOpt( 'width' );
		$sizeString = (string)( $size['ak'] ?? '' );
		if ( $sizeString === '' && !empty( $ww['fromDataMW'] ) ) {
			$sizeString = (string)( $ww['value'] ?? '' );
		}
		if ( $sizeUnmodified && $sizeString !== '' ) {
			// preserve original width/height string if not touched
			$nopts[] = [
				'ck' => 'width',
				'v' => $sizeString, // original size string
				'ak' => [ '$1' ], // don't add px or the like
			];
		} else {
			$bbox = null;
			// Serialize to a square bounding box
			if ( isset( $ww['value'] ) && preg_match( '/^\d+/', $ww['value'] ) ) {
				$bbox = intval( $ww['value'] );
			}
			if ( isset( $wh['value'] ) && preg_match( '/^\d+/', $wh['value'] ) &&
				// As with "mw-default-size", editing clients should remove the
				// "mw-default-audio-height" if they want to factor a defined
				// height into the bounding box size. However, note that, at
				// present, a defined height for audio is ignored while parsing,
				// so this only has the effect of modifying the width.
				(
					DOMCompat::nodeName( $elt ) !== 'audio' ||
					!DOMUtils::hasClass( $outerElt, 'mw-default-audio-height' )
				)
			) {
				$height = intval( $wh['value'] );
				if ( $bbox === null || $height > $bbox ) {
					$bbox = $height;
				}
			}
			if ( $bbox !== null ) {
				$nopts[] = [
					'ck' => 'width',
					// MediaWiki interprets 100px as a width
					// restriction only, so we need to make the bounding
					// box explicitly square (100x100px). The 'px' is
					// added by the alias though, and can be localized.
					'v' => $bbox . 'x' . $bbox,
					'ak' => $mwAliases['img_width'], // adds the 'px' suffix
				];
			}
		}
	}

	$opts = $outerDP->optList ?? []; // original wikitext options

	// Add bogus options from old optlist in order to round-trip cleanly (T64500)
	foreach ( $opts as $o ) {
		if ( ( $o['ck'] ?? null ) === 'bogus' ) {
			$nopts[] = [
				'ck' => 'bogus',
				'ak' => [ $o['ak'] ],
			];
		}
	}

	// Put the caption last, by default.
	if ( is_string( $caption ) ) {
		$nopts[] = [
			'ck' => 'caption',
			'ak' => [ $caption ],
		];
	}

	// ok, sort the new options to match the order given in the old optlist
	// and try to match up the aliases used
	$changed = false;
	foreach ( $nopts as &$no ) {
		// Make sure we have an array here. Default in data-parsoid is
		// actually a string.
		// FIXME: don't reuse ak for two different things!
		if ( !is_array( $no['ak'] ) ) {
			$no['ak'] = [ $no['ak'] ];
		}

		$no['sortId'] = count( $opts );
		$idx = -1;
		foreach ( $opts as $i => $o ) {
			if ( ( $o['ck'] ?? null ) === $no['ck'] &&
				// for bogus options, make sure the source matches too.
				( $o['ck'] !== 'bogus' || $o['ak'] === $no['ak'][0] )
			) {
				$idx = $i;
				break;
			}
		}
		if ( $idx < 0 ) {
			// Preferred words are first in the alias list
			// (but not in old versions of mediawiki).
			$no['ak'] = $no['ak'][0];
			$changed = true;
			continue;
		}

		$no['sortId'] = $idx;
		// use a matching alias, if there is one
		$a = null;
		foreach ( $no['ak'] as $b ) {
			// note the trim() here; that allows us to snarf eccentric
			// whitespace from the original option wikitext
			$b2 = $b;
			if ( isset( $no['v'] ) ) {
				$b2 = str_replace( '$1', $no['v'], $b );
			}
			if ( $b2 === trim( implode( ',', (array)$opts[$idx]['ak'] ) ) ) {
				$a = $b;
				break;
			}
		}
		// use the alias (incl whitespace) from the original option wikitext
		// if found; otherwise use the last alias given (English default by
		// convention that works everywhere).
		// TODO: use first alias (localized) instead for RTL languages (T53852)
		if ( $a !== null && $no['ck'] !== 'caption' ) {
			$no['ak'] = $opts[$idx]['ak'];
			unset( $no['v'] ); // prevent double substitution
		} else {
			$no['ak'] = PHPUtils::lastItem( $no['ak'] );
			if ( !( $no['ck'] === 'caption' && $a !== null ) ) {
				$changed = true;
			}
		}
	}
	// Break the reference left behind by the by-reference foreach so that
	// $no no longer aliases the last element of $nopts.
	unset( $no );

	// Filter out bogus options if the image options/caption have changed.
	if ( $changed ) {
		$nopts = array_filter( $nopts, static function ( $no ) {
			return $no['ck'] !== 'bogus';
		} );
		// empty captions should get filtered out in this case, too (T64264)
		$nopts = array_filter( $nopts, static function ( $no ) {
			return !( $no['ck'] === 'caption' && $no['ak'] === '' );
		} );
	}

	// sort!
	usort( $nopts, static function ( $a, $b ) {
		return $a['sortId'] <=> $b['sortId'];
	} );

	// emit all the options as wikitext!
	$wikitext = '[[' . $resource['value'];
	foreach ( $nopts as $o ) {
		$wikitext .= '|';
		if ( isset( $o['v'] ) ) {
			$wikitext .= str_replace( '$1', $o['v'], $o['ak'] );
		} else {
			$wikitext .= $o['ak'];
		}
	}
	$wikitext .= ']]';

	return new WikiLinkText(
		$wikitext, $outerElt, $state->getEnv()->getSiteConfig(), 'mw:File'
	);
}
1637
1638	}