Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 810 |
|
0.00% |
0 / 16 |
CRAP | |
0.00% |
0 / 1 |
LinkHandlerUtils | |
0.00% |
0 / 810 |
|
0.00% |
0 / 16 |
86730 | |
0.00% |
0 / 1 |
splitLinkContentString | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
30 | |||
getHref | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
56 | |||
normalizeIWP | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
escapeLinkTarget | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
6 | |||
getContentString | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
42 | |||
getLinkRoundTripData | |
0.00% |
0 / 122 |
|
0.00% |
0 / 1 |
2756 | |||
escapeExtLinkURL | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
addColonEscape | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
30 | |||
isURLLink | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
56 | |||
hasAutoUrlTerminatingChars | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
isSimpleWikiLink | |
0.00% |
0 / 49 |
|
0.00% |
0 / 1 |
306 | |||
serializeAsWikiLink | |
0.00% |
0 / 125 |
|
0.00% |
0 / 1 |
2352 | |||
serializeAsExtLink | |
0.00% |
0 / 21 |
|
0.00% |
0 / 1 |
72 | |||
linkHandler | |
0.00% |
0 / 68 |
|
0.00% |
0 / 1 |
380 | |||
figureHandler | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
6 | |||
figureToConstrainedText | |
0.00% |
0 / 342 |
|
0.00% |
0 / 1 |
12882 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Html2Wt; |
5 | |
6 | use stdClass; |
7 | use UnexpectedValueException; |
8 | use Wikimedia\Parsoid\Config\Env; |
9 | use Wikimedia\Parsoid\Core\MediaStructure; |
10 | use Wikimedia\Parsoid\DOM\Element; |
11 | use Wikimedia\Parsoid\DOM\Node; |
12 | use Wikimedia\Parsoid\DOM\Text; |
13 | use Wikimedia\Parsoid\Html2Wt\ConstrainedText\AutoURLLinkText; |
14 | use Wikimedia\Parsoid\Html2Wt\ConstrainedText\ConstrainedText; |
15 | use Wikimedia\Parsoid\Html2Wt\ConstrainedText\ExtLinkText; |
16 | use Wikimedia\Parsoid\Html2Wt\ConstrainedText\MagicLinkText; |
17 | use Wikimedia\Parsoid\Html2Wt\ConstrainedText\WikiLinkText; |
18 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
19 | use Wikimedia\Parsoid\NodeData\TempData; |
20 | use Wikimedia\Parsoid\Utils\ContentUtils; |
21 | use Wikimedia\Parsoid\Utils\DOMCompat; |
22 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
23 | use Wikimedia\Parsoid\Utils\DOMUtils; |
24 | use Wikimedia\Parsoid\Utils\PHPUtils; |
25 | use Wikimedia\Parsoid\Utils\TokenUtils; |
26 | use Wikimedia\Parsoid\Utils\UrlUtils; |
27 | use Wikimedia\Parsoid\Utils\Utils; |
28 | use Wikimedia\Parsoid\Utils\WTUtils; |
29 | use Wikimedia\Parsoid\Wt2Html\TokenizerUtils; |
30 | |
31 | /** |
32 | * Serializes link markup. |
33 | */ |
34 | class LinkHandlerUtils { |
35 | private static $REDIRECT_TEST_RE = '/^([ \t\n\r\0\x0b])*$/D'; // Matches a string made up only of space, tab, LF, CR, NUL and vertical-tab characters (the empty string included); no use of it is visible in this part of the file.
36 | private static $MW_TITLE_WHITESPACE_RE
37 | = '/[ _\xA0\x{1680}\x{180E}\x{2000}-\x{200A}\x{2028}\x{2029}\x{202F}\x{205F}\x{3000}]+/u'; // One or more spaces, underscores, NBSPs or other Unicode space/separator characters; isSimpleWikiLink() replaces each such run with '_'.
38 | |
/**
 * Peel a recorded link tail and link prefix off a content string.
 *
 * The tail is checked (and removed) first; the prefix is then checked
 * against whatever is left. An affix that is empty, or that does not
 * actually sit at the corresponding end of the string, is reported
 * back as ''.
 *
 * @param string $contentString
 * @param DataParsoid $dp Supplies the candidate ->prefix and ->tail
 * @return stdClass With string fields contentString, tail and prefix
 */
private static function splitLinkContentString( string $contentString, DataParsoid $dp ): stdClass {
	$linkTail = $dp->tail ?? '';
	$linkPrefix = $dp->prefix ?? '';

	if ( strlen( $linkTail ) && str_ends_with( $contentString, $linkTail ) ) {
		// The content really ends with the tail: cut it off.
		$contentString = substr( $contentString, 0, -strlen( $linkTail ) );
	} else {
		$linkTail = '';
	}

	if ( strlen( $linkPrefix ) && str_starts_with( $contentString, $linkPrefix ) ) {
		// Same treatment for the prefix, applied to the tail-less string.
		$contentString = substr( $contentString, strlen( $linkPrefix ) );
	} else {
		$linkPrefix = '';
	}

	$parts = new stdClass;
	$parts->contentString = $contentString;
	$parts->tail = $linkTail;
	$parts->prefix = $linkPrefix;
	return $parts;
}
71 | |
/**
 * Read the node's href, munging protocol-less absolute URLs.
 *
 * An href that starts with a single "/" (but not "//") is expanded
 * against the URL of each interwiki entry flagged `localinterwiki`; the
 * first expansion that matches that entry's URL pattern (with "$1"
 * treated as a wildcard) is returned. Every other href, and any href for
 * which no entry matches, is returned unchanged.
 *
 * @param Env $env
 * @param Element $node
 * @return string
 */
private static function getHref( Env $env, Element $node ): string {
	$href = DOMCompat::getAttribute( $node, 'href' ) ?? '';

	$isProtocolLessAbsolute = ( $href[0] ?? '' ) === '/' && ( $href[1] ?? '' ) !== '/';
	if ( !$isProtocolLessAbsolute ) {
		return $href;
	}

	// protocol-less but absolute: look for a base href that fits
	foreach ( $env->getSiteConfig()->interwikiMapNoNamespaces() as $interwikiInfo ) {
		if ( !isset( $interwikiInfo['localinterwiki'], $interwikiInfo['url'] ) ) {
			continue;
		}
		$base = $interwikiInfo['url'];

		// Resolve the href against this candidate base ...
		$expanded = UrlUtils::expandUrl( $href, $base );

		// ... and keep it only if it fits the base's own URL pattern.
		$basePattern = '/^' . strtr( preg_quote( $base, '/' ), [ '\\$1' => '.*' ] ) . '$/sD';
		if ( preg_match( $basePattern, $expanded ) ) {
			return $expanded;
		}
	}

	return $href;
}
102 | |
/**
 * Normalize an interwiki prefix (?): lower-case it, trim surrounding
 * whitespace, then pass it through PHPUtils::stripPrefix() with ':'.
 *
 * @param string $str
 * @return string
 */
private static function normalizeIWP( string $str ): string {
	$lowered = strtolower( $str );
	$trimmed = trim( $lowered );
	return PHPUtils::stripPrefix( $trimmed, ':' );
}
111 | |
/**
 * Entity-escape a link target and report whether the result is usable
 * as a link target.
 *
 * @param string $linkTarget
 * @param SerializerState $state
 * @return stdClass With fields linkTarget (the escaped string) and
 *   invalidLink (bool)
 */
private static function escapeLinkTarget( string $linkTarget, SerializerState $state ): stdClass {
	// Entity-escape the content.
	$escaped = Utils::escapeWtEntities( $linkTarget );

	$result = new stdClass;
	$result->linkTarget = $escaped;
	// The target is invalid if the environment rejects it, or if a literal
	// "|" survives anywhere in it. `isValidLinkTarget` omits fragments (the
	// part after #), so the pipe has to be looked for separately. The
	// percent encoded version is fine in the fragment, since it won't break
	// the parse.
	$result->invalidLink = !$state->getEnv()->isValidLinkTarget( $escaped )
		|| strpos( $escaped, '|' ) !== false;
	return $result;
}
132 | |
/**
 * Return the node's content as plain text, or null when it cannot be
 * represented that way.
 *
 * Text children contribute their value, mw:DisplaySpace children
 * contribute a single space, and diff markers contribute nothing. Any
 * other kind of child makes the result null.
 *
 * NOTE: null versus empty string is not entirely consistent here: a node
 * without children yields null, whereas a node holding only diff markers
 * yields ''. One of the current callers seems to subtly depend on that.
 *
 * FIXME(T254501): This function can return `$node->textContent` instead
 * of the string concatenation once mw:DisplaySpace is preprocessed away.
 *
 * @param Node $node
 * @return ?string
 */
private static function getContentString( Node $node ): ?string {
	if ( !$node->hasChildNodes() ) {
		return null;
	}

	$text = '';
	for ( $kid = $node->firstChild; $kid; $kid = $kid->nextSibling ) {
		if ( $kid instanceof Text ) {
			$text .= $kid->nodeValue;
			continue;
		}
		if ( DOMUtils::hasTypeOf( $kid, 'mw:DisplaySpace' ) ) {
			$text .= ' ';
			continue;
		}
		if ( !DiffUtils::isDiffMarker( $kid ) ) {
			// Something that is neither text, display space nor diff marker.
			return null;
		}
	}
	return $text;
}
166 | |
167 | /**
168 | * Gather the round-trip (RT) data used to serialize a link node: link type (taken from `rel`), href/origHref, shadowed target, tail/prefix, content string or content node, and interwiki flags.
169 | * @param Env $env
170 | * @param Element $node
171 | * @param SerializerState $state
172 | * @return stdClass Always carries type, href, origHref, target, tail, prefix, linkType and content; contentModified, contentNode, isRedirect, isInterwiki, isInterwikiLang and isLocal are only set on some paths.
173 | */
174 | private static function getLinkRoundTripData(
175 | Env $env, Element $node, SerializerState $state
176 | ): stdClass {
177 | $dp = DOMDataUtils::getDataParsoid( $node );
178 | $siteConfig = $env->getSiteConfig();
179 | $rtData = (object)[
180 | 'type' => null, // could be null
181 | 'href' => null, // filled in below
182 | 'origHref' => null, // filled in below
183 | 'target' => null, // filled in below
184 | 'tail' => $dp->tail ?? '',
185 | 'prefix' => $dp->prefix ?? '',
186 | 'linkType' => null
187 | ];
188 | $rtData->content = new stdClass;
189 | 
190 | // Figure out the type of the link
191 | if ( $node->hasAttribute( 'rel' ) ) {
192 | $rel = DOMCompat::getAttribute( $node, 'rel' ) ?? '';
193 | // Parsoid only emits and recognizes ExtLink, WikiLink, and PageProp rel values.
194 | // Everything else defaults to ExtLink during serialization (unless it is
195 | // serializable to a wikilink)
196 | // We're keeping the preg_match here instead of going through DOMUtils::matchRel
197 | // because we have \b guards to handle the multivalue, and we're keeping the matches,
198 | // which matchRel doesn't do.
199 | if ( preg_match( '/\b(mw:(WikiLink|ExtLink|MediaLink|PageProp)\S*)\b/', $rel, $typeMatch ) ) {
200 | $rtData->type = $typeMatch[1];
201 | // Strip link subtype info
202 | if ( $typeMatch[2] === 'WikiLink' || $typeMatch[2] === 'ExtLink' ) {
203 | $rtData->type = 'mw:' . $typeMatch[2];
204 | }
205 | }
206 | }
207 | 
208 | // Default link type if nothing else is set
209 | if ( $rtData->type === null && !DOMUtils::selectMediaElt( $node ) ) {
210 | $rtData->type = 'mw:ExtLink';
211 | }
212 | 
213 | // Get href, and save the token's "real" href for comparison (href has any leading ./ and ../ segments stripped; origHref keeps them)
214 | $href = self::getHref( $env, $node );
215 | $rtData->origHref = $href;
216 | $rtData->href = preg_replace( '#^(\.\.?/)+#', '', $href, 1 );
217 | 
218 | // WikiLinks should be relative (but see below); fixup the link type
219 | // if a WikiLink has an absolute URL.
220 | // (This may get converted back to a WikiLink below, in the interwiki
221 | // handling code.)
222 | if ( $rtData->type === 'mw:WikiLink' &&
223 | ( preg_match( '#^(\w+:)?//#', $rtData->href ) ||
224 | substr( $rtData->origHref ?? '', 0, 1 ) === '/' )
225 | ) {
226 | $rtData->type = 'mw:ExtLink';
227 | }
228 | 
229 | // Now get the target from rt data
230 | $rtData->target = $state->serializer->serializedAttrVal( $node, 'href' );
231 | 
232 | // Check if the link content has been modified or is newly inserted content.
233 | // FIXME: This will only work with selser of course. Hard to test without selser.
234 | if (
235 | $state->inInsertedContent ||
236 | DiffUtils::hasDiffMark( $node, DiffMarkers::SUBTREE_CHANGED )
237 | ) {
238 | $rtData->contentModified = true;
239 | }
240 | 
241 | // Get the content string or tokens
242 | $contentString = self::getContentString( $node );
243 | if ( $contentString !== null ) {
244 | if ( !empty( $rtData->target['value'] ) && $rtData->target['value'] !== $contentString ) {
245 | // Try to identify a new potential tail
246 | $contentParts = self::splitLinkContentString( $contentString, $dp );
247 | $rtData->content->string = $contentParts->contentString;
248 | $rtData->tail = $contentParts->tail;
249 | $rtData->prefix = $contentParts->prefix;
250 | } else {
251 | $rtData->tail = '';
252 | $rtData->prefix = '';
253 | $rtData->content->string = $contentString;
254 | }
255 | } elseif ( $node->hasChildNodes() ) {
256 | $rtData->contentNode = $node;
257 | } elseif ( $rtData->type === 'mw:PageProp/redirect' ) {
258 | $rtData->isRedirect = true;
259 | $rtData->prefix = $dp->src
260 | ?? ( ( $siteConfig->mwAliases()['redirect'][0] ?? '#REDIRECT' ) . ' ' );
261 | }
262 | 
263 | // Update link type based on additional analysis.
264 | // What might look like external links might be serializable as a wikilink. ($target is a reference, so writes to it below update $rtData->target in place.)
265 | $target = &$rtData->target;
266 | 
267 | // mw:MediaLink annotations are considered authoritative
268 | // and interwiki link matches aren't made for these
269 | if ( $rtData->type === 'mw:MediaLink' ) {
270 | // Parse title from resource attribute (see analog in image handling)
271 | $resource = $state->serializer->serializedAttrVal( $node, 'resource' );
272 | if ( $resource['value'] === null ) {
273 | // from non-parsoid HTML: try to reconstruct resource from href?
274 | // (See similar code which tries to guess resource from <img src>)
275 | $mediaPrefix = $siteConfig->namespaceName( $siteConfig->namespaceId( 'media' ) );
276 | $slashPos = strrpos( $rtData->origHref, '/' );
277 | $fileName = $slashPos === false ? $rtData->origHref :
278 | substr( $rtData->origHref, $slashPos + 1 );
279 | $resource = [
280 | 'value' => $mediaPrefix . ':' . $fileName,
281 | 'fromsrc' => false,
282 | 'modified' => false
283 | ];
284 | }
285 | $rtData->target = $resource;
286 | $rtData->href = preg_replace( '#^(\.\.?/)+#', '', $rtData->target['value'], 1 );
287 | return $rtData;
288 | }
289 | 
290 | // Check if the href matches any of our interwiki URL patterns. NOTE(review): the map lookup that follows is not guarded with `??` -- presumably interwikiMatcher() only returns prefixes present in the map; confirm.
291 | $interwikiMatch = $siteConfig->interwikiMatcher( $href );
292 | if ( !$interwikiMatch ) {
293 | return $rtData;
294 | }
295 | 
296 | $iw = $siteConfig->interwikiMapNoNamespaces()[ltrim( $interwikiMatch[0], ':' )];
297 | $localInterwiki = !empty( $iw['local'] );
298 | 
299 | // Only to be used in question mark check, since other checks want to include the fragment
300 | $targetForQmarkCheck = $interwikiMatch[1];
301 | // FIXME: If ever the default value for $wgExternalInterwikiFragmentMode
302 | // changes, we can reduce this by always stripping off the fragment
303 | // identifier, since in "html5" mode, that isn't encoded. At present,
304 | // we can only do that if we know it's a local interwiki link.
305 | if ( $localInterwiki ) {
306 | $withoutFragment = strstr( $targetForQmarkCheck, '#', true );
307 | if ( $withoutFragment !== false ) {
308 | $targetForQmarkCheck = $withoutFragment;
309 | }
310 | }
311 | 
312 | if (
313 | // Question mark is a valid title char, so it won't fail the test below,
314 | // but gets percent encoded on the way out since it has special
315 | // semantics in a url. That will break the url we're serializing, so
316 | // protect it.
317 | strpos( $targetForQmarkCheck, '?' ) === false &&
318 | // Ensure we have a valid link target, otherwise falling back to extlink
319 | // is preferable, since it won't serialize as a link.
320 | (
321 | $interwikiMatch[1] === '' || !self::escapeLinkTarget(
322 | // Append the prefix since we want to validate the target
323 | // with respect to it being an interwiki.
324 | $interwikiMatch[0] . ':' . $interwikiMatch[1],
325 | $state
326 | )->invalidLink
327 | ) &&
328 | // ExtLinks should have content to convert.
329 | (
330 | $rtData->type !== 'mw:ExtLink' ||
331 | !empty( $rtData->content->string ) ||
332 | !empty( $rtData->contentNode )
333 | ) &&
334 | ( !empty( $dp->isIW ) || !empty( $target['modified'] ) || !empty( $rtData->contentModified ) )
335 | ) {
336 | // External link that is really an interwiki link. Convert it.
337 | // TODO: Leaving this for backwards compatibility, remove when 1.5 is no longer bound
338 | if ( $rtData->type === 'mw:ExtLink' ) {
339 | $rtData->type = 'mw:WikiLink';
340 | }
341 | $rtData->isInterwiki = true;
342 | $iwMap = $siteConfig->interwikiMapNoNamespaces();
343 | // could this be confused with a language link?
344 | $iwi = $iwMap[self::normalizeIWP( $interwikiMatch[0] )] ?? null;
345 | $rtData->isInterwikiLang = $iwi && isset( $iwi['language'] );
346 | // is this our own wiki?
347 | $rtData->isLocal = $iwi && isset( $iwi['localinterwiki'] );
348 | // strip off localinterwiki prefixes: $localPrefix accumulates each leading "prefix:" of the target that maps to a localinterwiki entry; on exit $oldPrefix holds the match for the first prefix that does not (or an empty array when nothing matched)
349 | $localPrefix = '';
350 | $oldPrefix = null;
351 | while ( true ) {
352 | $tmp = substr( $target['value'], strlen( $localPrefix ) );
353 | if ( !preg_match( '/^(:?([^:]+)):/', $tmp, $oldPrefix ) ) {
354 | break;
355 | }
356 | $iwi = $iwMap[Utils::normalizeNamespaceName( $oldPrefix[2] )] ?? null;
357 | if ( !$iwi || !isset( $iwi['localinterwiki'] ) ) {
358 | break;
359 | }
360 | $localPrefix .= $oldPrefix[1] . ':';
361 | }
362 | 
363 | if ( !empty( $target['fromsrc'] ) && empty( $target['modified'] ) ) {
364 | // Leave the target alone!
365 | } else {
366 | if ( $rtData->type === 'mw:PageProp/Language' ) {
367 | $targetValue = implode( ':', $interwikiMatch );
368 | // Strip initial colon
369 | if ( $targetValue[0] === ':' ) {
370 | $targetValue = substr( $targetValue, 1 );
371 | }
372 | $target['value'] = $targetValue;
373 | } elseif (
374 | $oldPrefix && ( // Should we preserve the old prefix?
375 | strcasecmp( $oldPrefix[1], $interwikiMatch[0] ) === 0 ||
376 | // Check if the old prefix mapped to the same URL as
377 | // the new one. Use the old one if that's the case.
378 | // Example: [[w:Foo]] vs. [[:en:Foo]]
379 | ( $iwMap[self::normalizeIWP( $oldPrefix[1] )]['url'] ?? null )
380 | === ( $iwMap[self::normalizeIWP( $interwikiMatch[0] )]['url'] ?? null )
381 | )
382 | ) {
383 | // Reuse old prefix capitalization
384 | if ( Utils::decodeWtEntities( substr( $target['value'], strlen( $oldPrefix[1] ) + 1 ) )
385 | !== $interwikiMatch[1]
386 | ) {
387 | // Modified, update target.value.
388 | $target['value'] = $localPrefix . $oldPrefix[1] . ':' . $interwikiMatch[1];
389 | }
390 | // Ensure that we generate an interwiki link and not a language link!
391 | if ( $rtData->isInterwikiLang && $target['value'][0] !== ':' ) {
392 | $target['value'] = ':' . $target['value'];
393 | }
394 | } else { // Else: preserve old encoding
395 | if ( !empty( $rtData->isLocal ) ) {
396 | // - interwikiMatch[0] will be something like ":en" or "w"
397 | // - This tests whether the interwiki-like link is actually
398 | // a local wikilink.
399 | 
400 | $target['value'] = $interwikiMatch[1];
401 | // interwikiMatch[1] may start with a language link prefix,
402 | // ensure that we generate interwiki link syntax in that case. (T292022)
403 | if (
404 | preg_match( '/^([^:]+):/', $target['value'], $match ) &&
405 | !empty( $iwMap[self::normalizeIWP( $match[1] )]['language'] )
406 | ) {
407 | $target['value'] = ':' . $target['value'];
408 | }
409 | 
410 | $rtData->isInterwiki = $rtData->isInterwikiLang = false;
411 | } else {
412 | $target['value'] = implode( ':', $interwikiMatch );
413 | }
414 | }
415 | }
416 | }
417 | 
418 | return $rtData;
419 | }
420 | |
/**
 * The provided URL is already percent-encoded -- but it may still
 * not be safe for wikitext. Add additional escapes to make the URL
 * wikitext-safe. Don't touch percent escapes already in the url,
 * though!
 *
 * Two passes are made:
 *  1. Every character that is not allowed in an external link URL
 *     (brackets, angle brackets, double quote, control characters,
 *     Unicode spaces) and every "-" directly followed by "{" is replaced
 *     through Utils::entityEncodeAll().
 *  2. If the host is a bracketed IPv6 literal, the brackets that pass 1
 *     just encoded are turned back into literal "[" and "]".
 *
 * Pass 2 has to look for the *encoded* brackets: after pass 1 no literal
 * "[" or "]" is left in the string. (A pattern spelled with literal
 * brackets here is not even a valid regexp: "[([0-9a-f:.]" reads as one
 * character class and leaves an unmatched ")".)
 * NOTE(review): this assumes Utils::entityEncodeAll() emits hexadecimal
 * numeric entities of the form &#x5B; / &#x5D; (matched
 * case-insensitively) -- confirm against that helper.
 *
 * @param string $urlStr
 * @return string
 */
private static function escapeExtLinkURL( string $urlStr ): string {
	// this regexp is the negation of EXT_LINK_URL_CLASS in the PHP parser
	$entityEncoded = preg_replace_callback(
		// phpcs:ignore Generic.Files.LineLength.TooLong
		'/[\]\[<>"\x00-\x20\x7F\x{A0}\x{1680}\x{180E}\x{2000}-\x{200A}\x{202F}\x{205F}\x{3000}]|-(?=\{)/u',
		static function ( $m ) {
			return Utils::entityEncodeAll( $m[0] );
		},
		$urlStr
	);

	// IPv6 host names are bracketed with []. Entity-decode these.
	return preg_replace(
		'!^([a-z][^:/]*:)?//&#x5B;([0-9a-f:.]+)&#x5D;(:\d|/|$)!iD',
		'$1//[$2]$3',
		$entityEncoded,
		1
	);
}
446 | |
/**
 * Prefix a wikilink target with ':' when it would otherwise be read as
 * a category or file link.
 *
 * The colon is added only when all of these hold: the target's title is
 * in the category or file namespace, the link type is mw:WikiLink, and
 * the target does not already start with ':'.
 *
 * @param Env $env
 * @param string $linkTarget
 * @param stdClass $linkData
 * @return string
 */
private static function addColonEscape(
	Env $env, string $linkTarget, stdClass $linkData
): string {
	$title = $env->makeTitleFromText( $linkTarget );
	$categoryNs = $env->getSiteConfig()->canonicalNamespaceId( 'category' );
	$fileNs = $env->getSiteConfig()->canonicalNamespaceId( 'file' );

	$titleNs = $title->getNamespace();
	if ( $titleNs !== $categoryNs && $titleNs !== $fileNs ) {
		return $linkTarget;
	}
	if ( $linkData->type !== 'mw:WikiLink' ) {
		return $linkTarget;
	}
	if ( $linkTarget[0] === ':' ) {
		// Already escaped.
		return $linkTarget;
	}

	// Escape category and file links
	return ':' . $linkTarget;
}
470 | |
/**
 * Decide whether the link can be serialized as a bare URL link.
 *
 * That requires non-empty plain-text content which equals either the
 * target value or the node's (munged) href, does not start with "//",
 * has a valid protocol, and does not end in a character that would
 * terminate an autourl.
 *
 * @param Env $env
 * @param Element $node
 * @param stdClass $linkData
 * @return bool
 */
private static function isURLLink( Env $env, Element $node, stdClass $linkData ): bool {
	$target = $linkData->target;

	// Get plain text content, if any
	$text = self::getContentString( $node );
	if ( $text === null || $text === '' ) {
		return false;
	}

	// Can we minimize this?
	if ( $target['value'] !== $text && self::getHref( $env, $node ) !== $text ) {
		return false;
	}

	// protocol-relative url links not allowed in text
	// (see autourl rule in peg tokenizer, T32269)
	if ( str_starts_with( $text, '//' ) ) {
		return false;
	}
	if ( !Utils::isProtocolValid( $text, $env ) ) {
		return false;
	}

	return !self::hasAutoUrlTerminatingChars( $text );
}
493 | |
/**
 * Report whether a URL ends in a character at which the legacy parser
 * (Parser.php::makeFreeExternalLink) would cut an autourl short. Such a
 * URL cannot be emitted as a bare URL if that behaviour is to be mimicked.
 *
 * The set of terminating characters is obtained from TokenizerUtils and
 * depends on whether the URL contains a "(".
 *
 * Note that for an empty $url the last-character lookup yields '', which
 * str_contains() always finds; the caller visible in this file only
 * passes non-empty strings.
 *
 * @param string $url
 * @return bool
 */
private static function hasAutoUrlTerminatingChars( string $url ): bool {
	$hasOpenParen = str_contains( $url, '(' );
	$terminators = TokenizerUtils::getAutoUrlTerminatingChars( $hasOpenParen );
	$lastChar = substr( $url, -1 );
	return str_contains( $terminators, $lastChar );
}
505 | |
/**
 * Decide between a simple link ([[Foo]]) and a piped one ([[Foo|bar]]).
 *
 * A simple link is possible only when the content is a plain string that
 * (after normalization) says the same thing as the target or the href.
 * The individual equivalence tests are tried in order and the first one
 * that succeeds settles the question.
 *
 * FIXME (SSS):
 * 1. Revisit this logic to see if all these checks
 *    are still relevant or whether this can be simplified somehow.
 * 2. There are also duplicate computations for env.normalizedTitleKey(..)
 *    and Util.decodeURIComponent(..) that could be removed.
 *
 * @param Env $env
 * @param DataParsoid $dp
 * @param array $target
 * @param stdClass $linkData
 * @return bool
 */
private static function isSimpleWikiLink(
	Env $env, DataParsoid $dp, array $target, stdClass $linkData
): bool {
	$text = $linkData->content->string ?? null;

	// Would need to pipe for any non-string content.
	if ( $text === null ) {
		return false;
	}
	// Preserve unmodified or non-minimal piped links.
	if ( empty( $target['modified'] ) && empty( $linkData->contentModified ) &&
		( $dp->stx ?? null ) === 'piped'
	) {
		return false;
	}
	// Relative links are not simple
	if ( str_starts_with( $text, './' ) ) {
		return false;
	}

	// Strip colon escapes from the original target as that is
	// stripped when deriving the content string.
	// Strip ./ prefixes as well since they are relative link prefixes
	// added to all titles.
	// The prefix stripping, when it occurs, also includes spaces before the prefix.
	// Finally, we also remove trailing spaces because these are removed for <a> links
	// by DOMNormalizer::moveTrailingSpacesOut, and we wouldn't want that to lead to the
	// link getting piped for only that reason.
	$bareTarget = rtrim(
		preg_replace( '#^\s*(:|\./)#', '', $target['value'], 1 )
	);

	// Strip colon escape after prefix for interwikis
	if ( !empty( $linkData->isInterwiki ) ) {
		$bareTarget = preg_replace( '#^(\w+:):#', '$1', $bareTarget, 1 );
	}

	$decodedTarget = Utils::decodeWtEntities( $bareTarget );
	// Deal with the protocol-relative link scenario as well
	$hrefHasProto = preg_match( '#^(\w+:)?//#', $linkData->href );

	// Normalize content string and decoded target before comparison.
	// Piped links don't come down this path => it is safe to normalize both.
	$text = str_replace( '_', ' ', $text );
	$decodedTarget = str_replace( '_', ' ', $decodedTarget );

	// Does the (normalized) content match the target, shadowed or actual?
	if ( $text === $decodedTarget ) {
		return true;
	}
	// try wrapped in forward slashes in case they were stripped
	if ( ( '/' . $text . '/' ) === $decodedTarget ) {
		return true;
	}
	// normalize as titles and compare
	// FIXME: This will strip an interwiki prefix. Is that right?
	if ( $env->normalizedTitleKey( $text, true )
		=== preg_replace( self::$MW_TITLE_WHITESPACE_RE, '_', $decodedTarget )
	) {
		return true;
	}

	// Relative link, "../Foo" form (only where the namespace has subpages)
	if ( $env->getSiteConfig()->namespaceHasSubpages(
			$env->getContextTitle()->getNamespace()
		) &&
		preg_match( '#^\.\./.*[^/]$#D', $bareTarget ) &&
		$text === $env->resolveTitle( $bareTarget )
	) {
		return true;
	}
	// Relative link, "../Foo/" form
	if ( preg_match( '#^\.\./.*?/$#D', $bareTarget ) &&
		$text === preg_replace( '#^(?:\.\./)+(.*?)/$#D', '$1', $bareTarget, 1 )
	) {
		return true;
	}

	// if content == href this could be a simple link... eg [[Foo]].
	// but if href is an absolute url with protocol, this won't
	// work: [[http://example.com]] is not a valid simple link!
	if ( $hrefHasProto ) {
		return false;
	}
	// Always compare against decoded uri because
	// <a rel="mw:WikiLink" href="7%25 Solution">7%25 Solution</a></p>
	// should serialize as [[7% Solution|7%25 Solution]]
	if ( $text === Utils::decodeURIComponent( $linkData->href ) ) {
		return true;
	}
	// normalize with underscores for comparison with href
	return $env->normalizedTitleKey( $text, true )
		=== Utils::decodeURIComponent( $linkData->href );
}
605 | |
606 | /** |
607 | * Serialize as wiki link |
608 | * @param Element $node |
609 | * @param SerializerState $state |
610 | * @param stdClass $linkData |
611 | */ |
612 | private static function serializeAsWikiLink( |
613 | Element $node, SerializerState $state, stdClass $linkData |
614 | ): void { |
615 | $contentParts = null; |
616 | $contentSrc = ''; |
617 | $isPiped = false; |
618 | $needsEscaping = true; |
619 | $env = $state->getEnv(); |
620 | $siteConfig = $env->getSiteConfig(); |
621 | $target = $linkData->target; |
622 | $dp = DOMDataUtils::getDataParsoid( $node ); |
623 | |
624 | // Decode any link that did not come from the source (data-mw/parsoid) |
625 | // Links that come from data-mw/data-parsoid will be true titles, |
626 | // but links that come from hrefs will need to be url-decoded. |
627 | // Ex: <a href="/wiki/A%3Fb">Foobar</a> |
628 | if ( empty( $target['fromsrc'] ) ) { |
629 | // Omit fragments from decoding |
630 | $hash = strpos( $target['value'], '#' ); |
631 | if ( $hash !== false ) { |
632 | $target['value'] = Utils::decodeURIComponent( substr( $target['value'], 0, $hash ) ) |
633 | . substr( $target['value'], $hash ); |
634 | } else { |
635 | $target['value'] = Utils::decodeURIComponent( $target['value'] ); |
636 | } |
637 | } |
638 | |
639 | // Special-case handling for category links |
640 | if ( $linkData->type === 'mw:PageProp/Category' ) { |
641 | // Split target and sort key in $target['value']. |
642 | // The sort key shows up as "#something" in there. |
643 | // However, watch out for parser functions that start with "{{#" |
644 | // The atomic group is essential to prevent "{{#" parser function prefix |
645 | // from getting split at the "{{" and "#" where the "{{" matches the |
646 | // [^#]* and the "#" matches after separately. |
647 | if ( preg_match( '/^((?>{{#|[^#])*)#(.*)/', $target['value'], $targetParts ) ) { |
648 | $target['value'] = strtr( preg_replace( '#^(\.\.?/)*#', '', $targetParts[1], 1 ), '_', ' ' ); |
649 | // FIXME: Reverse `Sanitizer.sanitizeTitleURI(strContent).replace(/#/g, '%23');` |
650 | $strContent = Utils::decodeURIComponent( $targetParts[2] ); |
651 | $contentParts = self::splitLinkContentString( $strContent, $dp ); |
652 | $linkData->content->string = $contentParts->contentString; |
653 | $dp->tail = $linkData->tail = $contentParts->tail; |
654 | $dp->prefix = $linkData->prefix = $contentParts->prefix; |
655 | } else { // No sort key, will serialize to simple link |
656 | // Normalize the content string |
657 | $linkData->content->string = strtr( |
658 | PHPUtils::stripPrefix( $target['value'], './' ), '_', ' ' |
659 | ); |
660 | } |
661 | |
662 | // Special-case handling for template-affected sort keys |
663 | // FIXME: sort keys cannot be modified yet, but if they are, |
664 | // we need to fully shadow the sort key. |
665 | // if ( !target.modified ) { |
666 | // The target and source key was not modified |
667 | $sortKeySrc = $state->serializer->serializedAttrVal( $node, 'mw:sortKey' ); |
668 | if ( isset( $sortKeySrc['value'] ) ) { |
669 | $linkData->contentNode = null; |
670 | $linkData->content->string = $sortKeySrc['value']; |
671 | // TODO: generalize this flag. It is already used by |
672 | // getAttributeShadowInfo. Maybe use the same |
673 | // structure as its return value? |
674 | $linkData->content->fromsrc = true; |
675 | } |
676 | // } |
677 | } else { |
678 | if ( $linkData->type === 'mw:PageProp/Language' ) { |
679 | // Fix up the content string |
680 | // TODO: see if linkData can be cleaner! |
681 | $linkData->content->string ??= Utils::decodeWtEntities( $target['value'] ); |
682 | } |
683 | } |
684 | |
685 | // The string value of the content, if it is plain text. |
686 | $linkTarget = null; |
687 | $escapedTgt = null; |
688 | if ( !empty( $linkData->isRedirect ) ) { |
689 | $linkTarget = $target['value']; |
690 | if ( !empty( $target['modified'] ) || empty( $target['fromsrc'] ) ) { |
691 | $linkTarget = strtr( preg_replace( '#^(\.\.?/)*#', '', $linkTarget, 1 ), '_', ' ' ); |
692 | $escapedTgt = self::escapeLinkTarget( $linkTarget, $state ); |
693 | $linkTarget = $escapedTgt->linkTarget; |
694 | // Determine if it's a redirect to a category, in which case |
695 | // it needs a ':' on front to distingish from a category link. |
696 | if ( preg_match( '/^([^:]+)[:]/', $linkTarget, $categoryMatch ) ) { |
697 | $ns = $siteConfig->namespaceId( Utils::normalizeNamespaceName( $categoryMatch[1] ) ); |
698 | if ( $ns === $siteConfig->canonicalNamespaceId( 'category' ) ) { |
699 | // Check that the next node isn't a category link, |
700 | // in which case we don't want the ':'. |
701 | $nextNode = $node->nextSibling; |
702 | if ( !( |
703 | $nextNode instanceof Element && DOMCompat::nodeName( $nextNode ) === 'link' && |
704 | DOMUtils::hasRel( $nextNode, 'mw:PageProp/Category' ) && |
705 | DOMCompat::getAttribute( $nextNode, 'href' ) === DOMCompat::getAttribute( $node, 'href' ) |
706 | ) ) { |
707 | $linkTarget = ':' . $linkTarget; |
708 | } |
709 | } |
710 | } |
711 | } |
712 | } elseif ( self::isSimpleWikiLink( $env, $dp, $target, $linkData ) ) { |
713 | // Simple case |
714 | if ( empty( $target['modified'] ) && empty( $linkData->contentModified ) ) { |
715 | $linkTarget = PHPUtils::stripPrefix( $target['value'], './' ); |
716 | } else { |
717 | // If token has templated attrs or is a subpage, use target.value |
718 | // since content string will be drastically different. |
719 | if ( WTUtils::hasExpandedAttrsType( $node ) || |
720 | preg_match( '#(^|/)\.\./#', $target['value'] ) |
721 | ) { |
722 | $linkTarget = PHPUtils::stripPrefix( $target['value'], './' ); |
723 | } else { |
724 | $escapedTgt = self::escapeLinkTarget( $linkData->content->string, $state ); |
725 | if ( !$escapedTgt->invalidLink ) { |
726 | $linkTarget = self::addColonEscape( $env, $escapedTgt->linkTarget, $linkData ); |
727 | } else { |
728 | $linkTarget = $escapedTgt->linkTarget; |
729 | } |
730 | } |
731 | if ( !empty( $linkData->isInterwikiLang ) && |
732 | $linkTarget[0] !== ':' && |
733 | $linkData->type !== 'mw:PageProp/Language' |
734 | ) { |
735 | // ensure interwiki links can't be confused with |
736 | // interlanguage links. |
737 | $linkTarget = ':' . $linkTarget; |
738 | } |
739 | } |
740 | } elseif ( self::isURLLink( $state->getEnv(), $node, $linkData ) |
741 | /* && empty( $linkData->isInterwiki ) */ |
742 | ) { |
743 | // Uncomment the above check if we want [[wikipedia:Foo|http://en.wikipedia.org/wiki/Foo]] |
744 | // for '<a href="http://en.wikipedia.org/wiki/Foo">http://en.wikipedia.org/wiki/Foo</a>' |
745 | $linkData->linkType = 'mw:URLLink'; |
746 | } else { |
747 | // Emit piped wikilink syntax |
748 | $isPiped = true; |
749 | |
750 | // First get the content source |
751 | if ( !empty( $linkData->contentNode ) ) { |
752 | $cs = $state->serializeLinkChildrenToString( |
753 | $linkData->contentNode, |
754 | [ $state->serializer->wteHandlers, 'wikilinkHandler' ] |
755 | ); |
756 | // strip off the tail and handle the pipe trick |
757 | $contentParts = self::splitLinkContentString( $cs, $dp ); |
758 | $contentSrc = $contentParts->contentString; |
759 | $dp->tail = $contentParts->tail; |
760 | $linkData->tail = $contentParts->tail; |
761 | $dp->prefix = $contentParts->prefix; |
762 | $linkData->prefix = $contentParts->prefix; |
763 | $needsEscaping = false; |
764 | } else { |
765 | $contentSrc = $linkData->content->string ?? ''; |
766 | $needsEscaping = empty( $linkData->content->fromsrc ); |
767 | } |
768 | |
769 | if ( $contentSrc === '' && $linkData->type !== 'mw:PageProp/Category' ) { |
770 | // Protect empty link content from PST pipe trick |
771 | $contentSrc = '<nowiki/>'; |
772 | $needsEscaping = false; |
773 | } |
774 | |
775 | $linkTarget = $target['value']; |
776 | if ( !empty( $target['modified'] ) || empty( $target['fromsrc'] ) ) { |
777 | // Links starting with ./ shouldn't get _ replaced with ' ' |
778 | $linkContentIsRelative = str_starts_with( $linkData->content->string ?? '', './' ); |
779 | $linkTarget = preg_replace( '#^(\.\.?/)*#', '', $linkTarget, 1 ); |
780 | if ( empty( $linkData->isInterwiki ) && !$linkContentIsRelative ) { |
781 | $linkTarget = strtr( $linkTarget, '_', ' ' ); |
782 | } |
783 | $escapedTgt = self::escapeLinkTarget( $linkTarget, $state ); |
784 | $linkTarget = $escapedTgt->linkTarget; |
785 | } |
786 | |
787 | // If we are reusing the target from source, we don't |
788 | // need to worry about colon-escaping because it will |
789 | // be in the right form already. |
790 | // |
791 | // Trying to eliminate this check and always check for |
792 | // colon-escaping seems a bit tricky when the reused |
793 | // target has encoded entities that won't resolve to |
794 | // valid titles. |
795 | if ( ( !$escapedTgt || !$escapedTgt->invalidLink ) && empty( $target['fromsrc'] ) ) { |
796 | $linkTarget = self::addColonEscape( $env, $linkTarget, $linkData ); |
797 | } |
798 | } |
799 | if ( $linkData->linkType === 'mw:URLLink' ) { |
800 | $state->emitChunk( new AutoURLLinkText( $node->textContent, $node ), $node ); |
801 | return; |
802 | } |
803 | |
804 | if ( !empty( $linkData->isRedirect ) ) { |
805 | // Drop duplicates |
806 | if ( $state->redirectText !== null ) { |
807 | return; |
808 | } |
809 | |
810 | // Buffer redirect text if it is not in start of file position |
811 | if ( !preg_match( self::$REDIRECT_TEST_RE, $state->out . $state->currLine->text ) ) { |
812 | $state->redirectText = $linkData->prefix . '[[' . $linkTarget . ']]'; |
813 | $state->emitChunk( '', $node ); // Flush separators for this node |
814 | // Flush separators for this node |
815 | return; |
816 | } |
817 | |
818 | // Set to some non-null string |
819 | $state->redirectText = 'unbuffered'; |
820 | } |
821 | |
822 | $pipedText = null; |
823 | if ( $escapedTgt && $escapedTgt->invalidLink ) { |
824 | // If the link target was invalid, instead of emitting an invalid link, |
825 | // omit the link and serialize just the content instead. But, log the |
826 | // invalid html for Parsoid clients to investigate later. |
827 | $state->getEnv()->log( |
828 | 'error/html2wt/link', 'Bad title text', DOMCompat::getOuterHTML( $node ) |
829 | ); |
830 | |
831 | // For non-piped content, use the original invalid link text |
832 | $pipedText = $isPiped ? $contentSrc : $linkTarget; |
833 | $state->needsEscaping = $needsEscaping; |
834 | $state->emitChunk( $linkData->prefix . $pipedText . $linkData->tail, $node ); |
835 | } else { |
836 | if ( $isPiped && $needsEscaping ) { |
837 | // We are definitely not in sol context since content |
838 | // will be preceded by "[[" or "[" text in target wikitext. |
839 | $pipedText = '|' . $state->serializer->wteHandlers |
840 | ->escapeLinkContent( $state, $contentSrc, false, $node, false ); |
841 | } elseif ( $isPiped ) { |
842 | $pipedText = '|' . $contentSrc; |
843 | } else { |
844 | $pipedText = ''; |
845 | } |
846 | if ( $isPiped ) { |
847 | $state->singleLineContext->disable(); |
848 | } |
849 | $state->emitChunk( new WikiLinkText( |
850 | $linkData->prefix . '[[' . $linkTarget . $pipedText . ']]' . $linkData->tail, |
851 | $node, $siteConfig, $linkData->type |
852 | ), $node ); |
853 | if ( $isPiped ) { |
854 | $state->singleLineContext->pop(); |
855 | } |
856 | } |
857 | } |
858 | |
/**
 * Serialize a link node using external-link style syntax.
 *
 * Depending on the link, one of three forms is emitted:
 *  - a bare URL (AutoURLLinkText) when isURLLink() says so;
 *  - `[[#anchor|content]]` (WikiLinkText) when the target starts with '#';
 *  - `[url content]` (ExtLinkText) otherwise.
 * In the two bracketed forms the content part (and its separator) is left
 * out only when the serialized content is the empty string.
 *
 * @param Element $node The link element being serialized
 * @param SerializerState $state Serializer state that receives the chunk
 * @param stdClass $linkData Round-trip data; `target` and `type` are read here
 */
private static function serializeAsExtLink(
	Element $node, SerializerState $state, stdClass $linkData
): void {
	$target = $linkData->target;
	$urlStr = $target['value'];
	if ( !empty( $target['modified'] ) || empty( $target['fromsrc'] ) ) {
		// We expect modified hrefs to be percent-encoded already, so
		// don't need to encode them here any more. Unmodified hrefs are
		// just using the original encoding anyway.
		// BUT we do have to encode certain special wikitext
		// characters (like []) which aren't necessarily
		// percent-encoded because they are valid in URLs and HTML5
		$urlStr = self::escapeExtLinkURL( $urlStr );
	}

	if ( self::isURLLink( $state->getEnv(), $node, $linkData ) ) {
		// Serialize as URL link
		$state->emitChunk( new AutoURLLinkText( $urlStr, $node ), $node );
		return;
	}

	$siteConfig = $state->getEnv()->getSiteConfig();

	$pureHashMatch = substr( $urlStr, 0, 1 ) === '#';
	// Fully serialize the content
	$contentStr = $state->serializeLinkChildrenToString(
		$node,
		[ $state->serializer->wteHandlers, $pureHashMatch ? 'wikilinkHandler' : 'aHandler' ]
	);

	// Test for emptiness explicitly instead of relying on truthiness:
	// PHP considers the string "0" falsy, which would silently drop a
	// link text of "0" and turn `[url 0]` into the auto-numbered `[url]`.
	$hasContent = (string)$contentStr !== '';

	if ( $pureHashMatch ) {
		// If it's just anchor text, serialize as an internal link.
		$chunk = new WikiLinkText(
			'[[' . $urlStr . ( $hasContent ? '|' . $contentStr : '' ) . ']]',
			$node, $siteConfig, $linkData->type
		);
	} else {
		// Without content this is an auto-numbered external link:
		// [http://example.com]
		$chunk = new ExtLinkText(
			'[' . $urlStr . ( $hasContent ? ' ' . $contentStr : '' ) . ']',
			$node, $siteConfig, $linkData->type
		);
	}
	$state->emitChunk( $chunk, $node );
}
909 | |
/**
 * Main link handler.
 *
 * Dispatch order:
 *  1. If the original href matches an external-resource (magic link)
 *     pattern, try to emit it as a magic link.
 *  2. If round-trip data provides both a link type and a target value,
 *     delegate to serializeAsWikiLink() / serializeAsExtLink(); any other
 *     type is an error.
 *  3. Otherwise treat the node as a plain HTML `<a>`: hand it to
 *     figureHandler() if it directly wraps a media element, else emit an
 *     external link built from its href (or plain text / a named span when
 *     there is no href).
 *
 * @param SerializerState $state
 * @param Element $node
 * @throws UnexpectedValueException When a link type with target info is
 *   neither a wikilink-style type nor 'mw:ExtLink'
 */
public static function linkHandler( SerializerState $state, Element $node ): void {
	// TODO: handle internal/external links etc using RDFa and dataParsoid
	// Also convert unannotated html links without advanced attributes to
	// external wiki links for html import. Might want to consider converting
	// relative links without path component and file extension to wiki links.
	$env = $state->getEnv();
	$siteConfig = $env->getSiteConfig();

	// Get the rt data from the token and tplAttrs
	$linkData = self::getLinkRoundTripData( $env, $node, $state );
	$linkType = $linkData->type;

	// If this could be a magic link, serialize it as a magic link by
	// changing the link type to ExtLink. (If magic links are disabled, then
	// the ExtResourceURLPatternMatcher() will return false.)
	$magicLinkMatch = $siteConfig->getExtResourceURLPatternMatcher()( Utils::decodeURI( $linkData->origHref ) );
	if ( $magicLinkMatch !== false ) {
		// Round-trip PMIDs as interwikis if that's how they were
		// originally. (Don't change the link type.)
		$keepAsInterwikiPMID = $magicLinkMatch[0] === 'PMID' &&
			DOMUtils::matchRel( $node, '|^mw:WikiLink/Interwiki\b|' ) !== null &&
			$linkType === 'mw:WikiLink';

		if ( !$keepAsInterwikiPMID ) {
			$magicContent = $state->serializeLinkChildrenToString(
				$node,
				[ $state->serializer->wteHandlers, 'aHandler' ]
			);
			$magicText = $siteConfig->makeExtResourceURL(
				$magicLinkMatch, $linkData->origHref, $magicContent
			);
			if ( $magicText[0] !== '[' ) {
				// Successfully serialized as a magic link
				$state->emitChunk( new MagicLinkText( $magicText, $node ), $node );
				return;
			}
		}
	}

	if ( $linkType !== null && isset( $linkData->target['value'] ) ) {
		// We have a type and target info
		$isWikiLinkStyle = $linkType === 'mw:WikiLink' || $linkType === 'mw:MediaLink' ||
			preg_match( TokenUtils::SOL_TRANSPARENT_LINK_REGEX, $linkType );

		if ( $isWikiLinkStyle ) {
			// [[..]] links: normal, category, redirect, or lang links
			// (except images)
			self::serializeAsWikiLink( $node, $state, $linkData );
			return;
		}
		if ( $linkType === 'mw:ExtLink' ) {
			// [..] links, autolinks, ISBN, RFC, PMID
			self::serializeAsExtLink( $node, $state, $linkData );
			return;
		}
		throw new UnexpectedValueException(
			'Unhandled link serialization scenario: ' . DOMCompat::getOuterHTML( $node )
		);
	}

	// No usable type/target info: this is a plain HTML link.
	// Attributes that can be dropped without making the link "complex".
	$safeAttr = [
		'href' => true,
		'rel' => true,
		'class' => true,
		'title' => true,
		DOMDataUtils::DATA_OBJECT_ATTR_NAME => true
	];

	$isComplexLink = false;
	foreach ( DOMUtils::attributes( $node ) as $attrName => $attrValue ) {
		// XXX: Don't drop rel and class in every case once a tags are
		// actually supported in the MW default config?
		if ( !isset( $safeAttr[$attrName] ) ) {
			$isComplexLink = true;
			break;
		}
	}

	if ( $isComplexLink ) {
		$env->log( 'error/html2wt/link', 'Encountered', DOMCompat::getOuterHTML( $node ),
			'-- serializing as extlink and dropping <a> attributes unsupported in wikitext.'
		);
	} else {
		$media = DOMUtils::selectMediaElt( $node ); // TODO: Handle missing media too
		if ( $media instanceof Element && $media->parentNode === $node ) {
			// this is a basic html figure: <a><img></a>
			self::figureHandler( $state, $node, new MediaStructure( $media, $node ) );
			return;
		}
	}

	// href is already percent-encoded, etc., but it might contain
	// spaces or other wikitext nasties. escape the nasties.
	$hrefStr = self::escapeExtLinkURL( self::getHref( $env, $node ) );
	$linkText = $state->serializeLinkChildrenToString(
		$node,
		[ $state->serializer->wteHandlers, 'aHandler' ]
	);

	if ( $hrefStr ) {
		$chunk = new ExtLinkText( '[' . $hrefStr . ' ' . $linkText . ']',
			$node, $siteConfig, 'mw:ExtLink'
		);
	} else {
		// Without an href, we just emit the string as text.
		// However, to preserve targets for anchor links,
		// serialize as a span with a name.
		$anchorName = DOMCompat::getAttribute( $node, 'name' );
		if ( $anchorName === null ) {
			$chunk = $linkText;
		} else {
			$doc = $node->ownerDocument;
			$span = $doc->createElement( 'span' );
			$span->setAttribute( 'name', $anchorName );
			$span->appendChild( $doc->createTextNode( $linkText ) );
			$chunk = DOMCompat::getOuterHTML( $span );
		}
	}
	$state->emitChunk( $chunk, $node );
}
1032 | |
/**
 * Main figure handler.
 *
 * When a media structure is supplied, it is converted with
 * figureToConstrainedText() and the result is emitted for $node (a null
 * result is emitted as the empty string). When it is missing, the problem
 * is logged together with the node's outer HTML and nothing is emitted.
 *
 * @param SerializerState $state
 * @param Element $node
 * @param ?MediaStructure $ms Parsed media structure, or null if parsing failed
 */
public static function figureHandler(
	SerializerState $state, Element $node, ?MediaStructure $ms
): void {
	if ( $ms !== null ) {
		$constrainedText = self::figureToConstrainedText( $state, $ms );
		$state->emitChunk( $constrainedText ?? '', $node );
		return;
	}

	$state->getEnv()->log(
		'error/html2wt/figure',
		"Couldn't parse media structure: ",
		DOMCompat::getOuterHTML( $node )
	);
}
1054 | |
1055 | /** |
1056 | * Serialize a figure to constrained text. |
1057 | * |
1058 | * WARN: There's probably more to do to ensure this is purely functional, |
1059 | * no side-effects (ie. calls to state->emit) happen while processing. |
1060 | * |
1061 | * @param SerializerState $state |
1062 | * @param MediaStructure $ms |
1063 | * @return ?ConstrainedText |
1064 | */ |
1065 | public static function figureToConstrainedText( |
1066 | SerializerState $state, MediaStructure $ms |
1067 | ): ?ConstrainedText { |
1068 | $env = $state->getEnv(); |
1069 | $outerElt = $ms->containerElt ?? $ms->mediaElt; |
1070 | $linkElt = $ms->linkElt; |
1071 | $elt = $ms->mediaElt; |
1072 | $captionElt = $ms->captionElt; |
1073 | $format = WTUtils::getMediaFormat( $outerElt ); |
1074 | |
1075 | // Try to identify the local title to use for this image. |
1076 | $resource = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'resource' ); |
1077 | if ( !isset( $resource['value'] ) ) { |
1078 | // from non-parsoid HTML: try to reconstruct resource from src? |
1079 | // (this won't work for manual-thumb images) |
1080 | $src = DOMCompat::getAttribute( $elt, 'src' ); |
1081 | if ( $src === null ) { |
1082 | $env->log( 'error/html2wt/figure', |
1083 | 'In WSP.figureHandler, img does not have resource or src:', |
1084 | DOMCompat::getOuterHTML( $outerElt ) |
1085 | ); |
1086 | return null; |
1087 | } |
1088 | if ( preg_match( '/^https?:/', $src ) ) { |
1089 | // external image link, presumably $wgAllowExternalImages=true |
1090 | return new AutoURLLinkText( $src, $outerElt ); |
1091 | } |
1092 | $resource = [ |
1093 | 'value' => $src, |
1094 | 'fromsrc' => false, |
1095 | 'modified' => false |
1096 | ]; |
1097 | } |
1098 | if ( empty( $resource['fromsrc'] ) ) { |
1099 | $resource['value'] = preg_replace( '#^(\.\.?/)+#', '', $resource['value'], 1 ); |
1100 | } |
1101 | |
1102 | $nopts = []; |
1103 | $outerDP = DOMDataUtils::getDataParsoid( $outerElt ); |
1104 | $outerDMW = DOMDataUtils::getDataMw( $outerElt ); |
1105 | $mwAliases = $state->getEnv()->getSiteConfig()->mwAliases(); |
1106 | |
1107 | // Return ref to the array element in case it is modified |
1108 | $getOpt = static function & ( $key ) use ( &$outerDP ): ?array { |
1109 | $null = null; |
1110 | if ( empty( $outerDP->optList ) ) { |
1111 | return $null; |
1112 | } |
1113 | foreach ( $outerDP->optList as $opt ) { |
1114 | if ( ( $opt['ck'] ?? null ) === $key ) { |
1115 | return $opt; |
1116 | } |
1117 | } |
1118 | return $null; |
1119 | }; |
1120 | // Return ref to the array element in case it is modified |
1121 | $getLastOpt = static function & ( $key ) use ( &$outerDP ): ?array { |
1122 | $null = null; |
1123 | $opts = $outerDP->optList ?? []; |
1124 | for ( $i = count( $opts ) - 1; $i >= 0; $i-- ) { |
1125 | if ( ( $opts[$i]['ck'] ?? null ) === $key ) { |
1126 | return $opts[$i]; |
1127 | } |
1128 | } |
1129 | return $null; |
1130 | }; |
1131 | |
1132 | // Try to identify the local title to use for the link. |
1133 | $link = null; |
1134 | |
1135 | $linkFromDataMw = WTSUtils::getAttrFromDataMw( $outerDMW, 'link', true ); |
1136 | if ( $linkFromDataMw !== null ) { |
1137 | // "link" attribute on the `outerElt` takes precedence |
1138 | if ( isset( $linkFromDataMw->value['html'] ) ) { |
1139 | $link = $state->serializer->getAttributeValueAsShadowInfo( $outerElt, 'link' ); |
1140 | } else { |
1141 | $link = [ |
1142 | 'value' => "link={$linkFromDataMw->value['txt']}", |
1143 | 'modified' => false, |
1144 | 'fromsrc' => false, |
1145 | 'fromDataMW' => true |
1146 | ]; |
1147 | } |
1148 | } elseif ( $linkElt && $linkElt->hasAttribute( 'href' ) ) { |
1149 | $link = $state->serializer->serializedImageAttrVal( $outerElt, $linkElt, 'href' ); |
1150 | if ( empty( $link['fromsrc'] ) ) { |
1151 | // strip page or lang parameter if present on href |
1152 | $strippedHref = preg_replace( |
1153 | '#[?]((?:page=\d+)|(?:lang=[a-z]+(?:-[a-z]+)*))$#Di', |
1154 | '', |
1155 | DOMCompat::getAttribute( $linkElt, 'href' ) ?? '' |
1156 | ); |
1157 | if ( $strippedHref === DOMCompat::getAttribute( $elt, 'resource' ) ) { |
1158 | // default link: same place as resource |
1159 | $link = $resource; |
1160 | } |
1161 | $link['value'] = preg_replace( '#^(\.\.?/)+#', '', $link['value'], 1 ); |
1162 | } |
1163 | } else { |
1164 | // Otherwise, just try and get it from data-mw |
1165 | $link = $state->serializer->getAttributeValueAsShadowInfo( $outerElt, 'href' ); |
1166 | } |
1167 | |
1168 | if ( $link && empty( $link['modified'] ) && empty( $link['fromsrc'] ) ) { |
1169 | $linkOpt = $getOpt( 'link' ); |
1170 | if ( $linkOpt ) { |
1171 | $link['fromsrc'] = true; |
1172 | $link['value'] = $linkOpt['ak']; |
1173 | } |
1174 | } |
1175 | |
1176 | // Reconstruct the caption |
1177 | if ( !$captionElt && is_string( $outerDMW->caption ?? null ) ) { |
1178 | // IMPORTANT: Assign to a variable to prevent the fragment |
1179 | // from getting GCed before we are done with it. |
1180 | $fragment = ContentUtils::createAndLoadDocumentFragment( |
1181 | $outerElt->ownerDocument, $outerDMW->caption, |
1182 | [ 'markNew' => true ] |
1183 | ); |
1184 | // FIXME: We should just be able to serialize the children of the |
1185 | // fragment, however, we need some way of marking this as being |
1186 | // inInsertedContent so that any bare text is assured to be escaped |
1187 | $captionElt = $outerElt->ownerDocument->createElement( 'div' ); |
1188 | DOMDataUtils::getDataParsoid( $captionElt )->setTempFlag( TempData::IS_NEW ); |
1189 | DOMUtils::migrateChildren( $fragment, $captionElt ); |
1190 | // Needs a parent node in order for WTS to be happy |
1191 | $fragment->appendChild( $captionElt ); |
1192 | } |
1193 | |
1194 | $caption = null; |
1195 | if ( $captionElt ) { |
1196 | $caption = $state->serializeCaptionChildrenToString( |
1197 | $captionElt, [ $state->serializer->wteHandlers, 'mediaOptionHandler' ] |
1198 | ); |
1199 | |
1200 | // Alt stuff |
1201 | if ( !WTUtils::hasVisibleCaption( $outerElt ) && $elt->hasAttribute( 'alt' ) ) { |
1202 | $altOnElt = trim( DOMCompat::getAttribute( $elt, 'alt' ) ?? '' ); |
1203 | $altFromCaption = trim( WTUtils::textContentFromCaption( $captionElt ) ); |
1204 | // The first condition is to support an empty \alt=\ option |
1205 | // when no caption is present |
1206 | if ( $altOnElt && ( $altOnElt === $altFromCaption ) ) { |
1207 | $elt->removeAttribute( 'alt' ); |
1208 | } |
1209 | } |
1210 | } |
1211 | |
1212 | // Fetch the alt (if any) |
1213 | $alt = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'alt' ); |
1214 | // Fetch the lang (if any) |
1215 | $lang = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'lang' ); |
1216 | // Fetch the muted (if any) |
1217 | $muted = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'muted' ); |
1218 | // Fetch the loop (if any) |
1219 | $loop = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'loop' ); |
1220 | |
1221 | // Ok, start assembling options, beginning with link & alt & lang |
1222 | // Other media don't have links in output. |
1223 | $linkCond = DOMCompat::nodeName( $elt ) === 'img'; |
1224 | if ( $linkCond && $link ) { |
1225 | // Check whether the link goes to the default place, in which |
1226 | // case an explicit link tag isn't needed. |
1227 | // The link may be external, or may include wikitext template markup, |
1228 | // therefore check first that it parses to a title. |
1229 | $linkTitle = $env->normalizedTitleKey( |
1230 | Utils::decodeURIComponent( $link['value'] ), true |
1231 | ); |
1232 | $resourceTitle = $env->normalizedTitleKey( |
1233 | Utils::decodeURIComponent( $resource['value'] ), true |
1234 | ); |
1235 | if ( |
1236 | $link['value'] === $resource['value'] || |
1237 | ( $linkTitle !== null && $linkTitle === $resourceTitle ) |
1238 | ) { |
1239 | $linkCond = false; // No explicit link attribute needed |
1240 | } |
1241 | } |
1242 | |
1243 | // "alt" for non-image is handled below |
1244 | $altCond = $alt['value'] !== null && DOMCompat::nodeName( $elt ) === 'img'; |
1245 | |
1246 | // This loop handles media options which *mostly* correspond 1-1 with |
1247 | // HTML attributes. `img_$name` is the name of the media option, |
1248 | // and $value is the Parsoid "shadow info" for the attribute. |
1249 | // $cond tells us whether we need to explicitly output this option; |
1250 | // if it is false we are using an implicit default. |
1251 | // `lang` and `alt` are fairly straightforward. `link` |
1252 | // is a little trickier, since we need to massage/fake the shadow |
1253 | // info because it doesn't come *directly* from the attribute. |
1254 | // link comes from the combination of a[href], img[src], and |
1255 | // img[resource], etc; |
1256 | foreach ( [ |
1257 | [ 'name' => 'link', 'value' => $link, 'cond' => $linkCond, 'alias' => 'img_link' ], |
1258 | [ 'name' => 'alt', 'value' => $alt, 'cond' => $altCond, 'alias' => 'img_alt' ], |
1259 | [ 'name' => 'lang', 'value' => $lang, 'cond' => isset( $lang['value'] ), 'alias' => 'img_lang' ], |
1260 | [ 'name' => 'muted', 'value' => $muted, 'cond' => isset( $muted['value'] ), 'alias' => 'timedmedia_muted' ], |
1261 | [ 'name' => 'loop', 'value' => $loop, 'cond' => isset( $loop['value'] ), 'alias' => 'timedmedia_loop' ], |
1262 | ] as $o ) { |
1263 | if ( !$o['cond'] ) { |
1264 | continue; |
1265 | } |
1266 | if ( $o['value'] && !empty( $o['value']['fromsrc'] ) ) { |
1267 | $nopts[] = [ |
1268 | 'ck' => $o['name'], |
1269 | 'ak' => [ $o['value']['value'] ], |
1270 | ]; |
1271 | } else { |
1272 | $value = $o['value'] ? $o['value']['value'] : ''; |
1273 | if ( $o['value'] && in_array( $o['name'], [ 'link', 'alt' ], true ) ) { |
1274 | // see WikiLinkHandler::isWikitextOpt(): link and alt are allowed |
1275 | // to contain arbitrary wikitext, even though it is stripped |
1276 | // to a string before emitting. |
1277 | $value = $state->serializer->wteHandlers->escapeLinkContent( |
1278 | $state, $value, false, $outerElt, true |
1279 | ); |
1280 | } |
1281 | $nopts[] = [ |
1282 | 'ck' => $o['name'], |
1283 | 'v' => $value, |
1284 | 'ak' => $mwAliases[$o['alias']], |
1285 | ]; |
1286 | } |
1287 | } |
1288 | |
1289 | // Now we handle media options which all come from space-separated |
1290 | // values in a single HTML attribute, `class`. (But note that there |
1291 | // can also be "extra" classes added by `img_class` as well.) |
1292 | $classes = DOMCompat::getClassList( $outerElt ); |
1293 | $extra = []; // 'extra' classes |
1294 | $val = null; |
1295 | |
1296 | foreach ( $classes as $c ) { |
1297 | switch ( $c ) { |
1298 | case 'mw-halign-none': |
1299 | case 'mw-halign-right': |
1300 | case 'mw-halign-left': |
1301 | case 'mw-halign-center': |
1302 | $val = substr( $c, 10 ); // strip mw-halign- prefix |
1303 | $nopts[] = [ |
1304 | 'ck' => $val, |
1305 | 'ak' => $mwAliases['img_' . $val], |
1306 | ]; |
1307 | break; |
1308 | |
1309 | case 'mw-valign-top': |
1310 | case 'mw-valign-middle': |
1311 | case 'mw-valign-baseline': |
1312 | case 'mw-valign-sub': |
1313 | case 'mw-valign-super': |
1314 | case 'mw-valign-text-top': |
1315 | case 'mw-valign-bottom': |
1316 | case 'mw-valign-text-bottom': |
1317 | $val = strtr( substr( $c, 10 ), '-', '_' ); // strip mw-valign and '-' to '_' |
1318 | $nopts[] = [ |
1319 | 'ck' => $val, |
1320 | 'ak' => $mwAliases['img_' . $val], |
1321 | ]; |
1322 | break; |
1323 | |
1324 | case 'mw-image-border': |
1325 | $nopts[] = [ |
1326 | 'ck' => 'border', |
1327 | 'ak' => $mwAliases['img_border'], |
1328 | ]; |
1329 | break; |
1330 | |
1331 | case 'mw-default-size': |
1332 | case 'mw-default-audio-height': |
1333 | // handled below |
1334 | break; |
1335 | |
1336 | default: |
1337 | $extra[] = $c; |
1338 | break; |
1339 | } |
1340 | } |
1341 | |
1342 | if ( count( $extra ) ) { |
1343 | $nopts[] = [ |
1344 | 'ck' => 'class', |
1345 | 'v' => implode( ' ', $extra ), |
1346 | 'ak' => $mwAliases['img_class'], |
1347 | ]; |
1348 | } |
1349 | |
1350 | // Now we handle parameters which don't have a representation |
1351 | // as HTML attributes; they are set only from the data-mw |
1352 | // values. (In theory they could perhaps be reverse engineered |
1353 | // from the thumbnail URL, but that would be fragile and expose |
1354 | // thumbnail implementation to the editor so we don't do that.) |
1355 | $mwParams = [ |
1356 | [ 'prop' => 'thumb', 'ck' => 'manualthumb', 'alias' => 'img_manualthumb' ], |
1357 | [ 'prop' => 'page', 'ck' => 'page', 'alias' => 'img_page' ], |
1358 | // Video specific |
1359 | [ 'prop' => 'starttime', 'ck' => 'starttime', 'alias' => 'timedmedia_starttime' ], |
1360 | [ 'prop' => 'endtime', 'ck' => 'endtime', 'alias' => 'timedmedia_endtime' ], |
1361 | [ 'prop' => 'thumbtime', 'ck' => 'thumbtime', 'alias' => 'timedmedia_thumbtime' ] |
1362 | ]; |
1363 | |
1364 | // `img_link` and `img_alt` are only surfaced as HTML attributes |
1365 | // for image media. For all other media we treat them as set only |
1366 | // from data-mw. |
1367 | if ( DOMCompat::nodeName( $elt ) !== 'img' ) { |
1368 | $mwParams[] = [ 'prop' => 'link', 'ck' => 'link', 'alias' => 'img_link' ]; |
1369 | $mwParams[] = [ 'prop' => 'alt', 'ck' => 'alt', 'alias' => 'img_alt' ]; |
1370 | } |
1371 | |
1372 | $hasManualthumb = false; |
1373 | foreach ( $mwParams as $o ) { |
1374 | $v = $outerDMW->{$o['prop']} ?? null; |
1375 | if ( $v === null ) { |
1376 | $a = WTSUtils::getAttrFromDataMw( $outerDMW, $o['ck'], true ); |
1377 | if ( $a !== null ) { |
1378 | if ( isset( $a->value['html'] ) ) { |
1379 | $si = $state->serializer->getAttributeValueAsShadowInfo( $outerElt, $o['ck'] ); |
1380 | if ( isset( $si['value'] ) ) { |
1381 | $nopts[] = [ |
1382 | 'ck' => $o['ck'], |
1383 | 'ak' => [ $si['value'] ], |
1384 | ]; |
1385 | continue; |
1386 | } |
1387 | } else { |
1388 | $v = $a->value['txt']; |
1389 | } |
1390 | } |
1391 | } |
1392 | if ( $v !== null ) { |
1393 | $ak = $state->serializer->getAttributeValue( |
1394 | $outerElt, $o['ck'] |
1395 | ) ?? $mwAliases[$o['alias']]; |
1396 | $nopts[] = [ |
1397 | 'ck' => $o['ck'], |
1398 | 'ak' => $ak, |
1399 | 'v' => $v |
1400 | ]; |
1401 | // Piggyback this here ... |
1402 | if ( $o['prop'] === 'thumb' ) { |
1403 | $hasManualthumb = true; |
1404 | $format = ''; |
1405 | } |
1406 | } |
1407 | } |
1408 | |
1409 | // These media options come from the HTML `typeof` attribute. |
1410 | switch ( $format ) { |
1411 | case 'Thumb': |
1412 | $nopts[] = [ |
1413 | 'ck' => 'thumbnail', |
1414 | 'ak' => $state->serializer->getAttributeValue( |
1415 | $outerElt, 'thumbnail' |
1416 | ) ?? $mwAliases['img_thumbnail'], |
1417 | ]; |
1418 | break; |
1419 | case 'Frame': |
1420 | $nopts[] = [ |
1421 | 'ck' => 'framed', |
1422 | 'ak' => $state->serializer->getAttributeValue( |
1423 | $outerElt, 'framed' |
1424 | ) ?? $mwAliases['img_framed'], |
1425 | ]; |
1426 | break; |
1427 | case 'Frameless': |
1428 | $nopts[] = [ |
1429 | 'ck' => 'frameless', |
1430 | 'ak' => $state->serializer->getAttributeValue( |
1431 | $outerElt, 'frameless' |
1432 | ) ?? $mwAliases['img_frameless'], |
1433 | ]; |
1434 | break; |
1435 | } |
1436 | |
1437 | // Now handle the size-related options. This is complicated! |
1438 | // We consider the `height`, `data-height`, `width`, and |
1439 | // `data-width` attributes, as well as the `typeof` and the `class`. |
1440 | |
1441 | // Get the user-specified height from wikitext |
1442 | $wh = $state->serializer->serializedImageAttrVal( |
1443 | $outerElt, $elt, $ms->isRedLink() ? 'data-height' : 'height' |
1444 | ); |
1445 | // Get the user-specified width from wikitext |
1446 | $ww = $state->serializer->serializedImageAttrVal( |
1447 | $outerElt, $elt, $ms->isRedLink() ? 'data-width' : 'width' |
1448 | ); |
1449 | |
1450 | $sizeUnmodified = !empty( $ww['fromDataMW'] ) || |
1451 | ( empty( $ww['modified'] ) && empty( $wh['modified'] ) ); |
1452 | $upright = $getOpt( 'upright' ); |
1453 | |
1454 | // XXX: Infer upright factor from default size for all thumbs by default? |
1455 | // Better for scaling with user prefs, but requires knowledge about |
1456 | // default used in VE. |
1457 | if ( $sizeUnmodified && $upright && |
1458 | // Only serialize upright where it is actually respected |
1459 | // This causes some dirty diffs, but makes sure that we don't |
1460 | // produce nonsensical output after a type switch. |
1461 | // TODO: Only strip if type was actually modified. |
1462 | in_array( $format, [ 'Frameless', 'Thumb' ], true ) |
1463 | ) { |
1464 | // preserve upright option |
1465 | $nopts[] = [ |
1466 | 'ck' => $upright['ck'], |
1467 | 'ak' => [ $upright['ak'] ], // FIXME: don't use ak here! |
1468 | ]; |
1469 | } |
1470 | |
1471 | if ( |
1472 | !DOMUtils::hasClass( $outerElt, 'mw-default-size' ) && |
1473 | $format !== 'Frame' && !$hasManualthumb |
1474 | ) { |
1475 | $size = $getLastOpt( 'width' ); |
1476 | $sizeString = (string)( $size['ak'] ?? '' ); |
1477 | if ( $sizeString === '' && !empty( $ww['fromDataMW'] ) ) { |
1478 | $sizeString = (string)( $ww['value'] ?? '' ); |
1479 | } |
1480 | if ( $sizeUnmodified && $sizeString !== '' ) { |
1481 | // preserve original width/height string if not touched |
1482 | $nopts[] = [ |
1483 | 'ck' => 'width', |
1484 | 'v' => $sizeString, // original size string |
1485 | 'ak' => [ '$1' ], // don't add px or the like |
1486 | ]; |
1487 | } else { |
1488 | $bbox = null; |
1489 | // Serialize to a square bounding box |
1490 | if ( isset( $ww['value'] ) && preg_match( '/^\d+/', $ww['value'] ) ) { |
1491 | $bbox = intval( $ww['value'] ); |
1492 | } |
1493 | if ( isset( $wh['value'] ) && preg_match( '/^\d+/', $wh['value'] ) && |
1494 | // As with "mw-default-size", editing clients should remove the |
1495 | // "mw-default-audio-height" if they want to factor a defined |
1496 | // height into the bounding box size. However, note that, at |
1497 | // present, a defined height for audio is ignored while parsing, |
1498 | // so this only has the effect of modifying the width. |
1499 | ( |
1500 | DOMCompat::nodeName( $elt ) !== 'audio' || |
1501 | !DOMUtils::hasClass( $outerElt, 'mw-default-audio-height' ) |
1502 | ) |
1503 | ) { |
1504 | $height = intval( $wh['value'] ); |
1505 | if ( $bbox === null || $height > $bbox ) { |
1506 | $bbox = $height; |
1507 | } |
1508 | } |
1509 | if ( $bbox !== null ) { |
1510 | $nopts[] = [ |
1511 | 'ck' => 'width', |
1512 | // MediaWiki interprets 100px as a width |
1513 | // restriction only, so we need to make the bounding |
1514 | // box explicitly square (100x100px). The 'px' is |
1515 | // added by the alias though, and can be localized. |
1516 | 'v' => $bbox . 'x' . $bbox, |
1517 | 'ak' => $mwAliases['img_width'], // adds the 'px' suffix |
1518 | ]; |
1519 | } |
1520 | } |
1521 | } |
1522 | |
1523 | $opts = $outerDP->optList ?? []; // original wikitext options |
1524 | |
1525 | // Add bogus options from old optlist in order to round-trip cleanly (T64500) |
1526 | foreach ( $opts as $o ) { |
1527 | if ( ( $o['ck'] ?? null ) === 'bogus' ) { |
1528 | $nopts[] = [ |
1529 | 'ck' => 'bogus', |
1530 | 'ak' => [ $o['ak'] ], |
1531 | ]; |
1532 | } |
1533 | } |
1534 | |
1535 | // Put the caption last, by default. |
1536 | if ( is_string( $caption ) ) { |
1537 | $nopts[] = [ |
1538 | 'ck' => 'caption', |
1539 | 'ak' => [ $caption ], |
1540 | ]; |
1541 | } |
1542 | |
1543 | // ok, sort the new options to match the order given in the old optlist |
1544 | // and try to match up the aliases used |
1545 | $changed = false; |
1546 | foreach ( $nopts as &$no ) { |
1547 | // Make sure we have an array here. Default in data-parsoid is |
1548 | // actually a string. |
1549 | // FIXME: don't reuse ak for two different things! |
1550 | if ( !is_array( $no['ak'] ) ) { |
1551 | $no['ak'] = [ $no['ak'] ]; |
1552 | } |
1553 | |
1554 | $no['sortId'] = count( $opts ); |
1555 | $idx = -1; |
1556 | foreach ( $opts as $i => $o ) { |
1557 | if ( ( $o['ck'] ?? null ) === $no['ck'] && |
1558 | // for bogus options, make sure the source matches too. |
1559 | ( $o['ck'] !== 'bogus' || $o['ak'] === $no['ak'][0] ) |
1560 | ) { |
1561 | $idx = $i; |
1562 | break; |
1563 | } |
1564 | } |
1565 | if ( $idx < 0 ) { |
1566 | // Preferred words are first in the alias list |
1567 | // (but not in old versions of mediawiki). |
1568 | $no['ak'] = $no['ak'][0]; |
1569 | $changed = true; |
1570 | continue; |
1571 | } |
1572 | |
1573 | $no['sortId'] = $idx; |
1574 | // use a matching alias, if there is one |
1575 | $a = null; |
1576 | foreach ( $no['ak'] as $b ) { |
1577 | // note the trim() here; that allows us to snarf eccentric |
1578 | // whitespace from the original option wikitext |
1579 | $b2 = $b; |
1580 | if ( isset( $no['v'] ) ) { |
1581 | $b2 = str_replace( '$1', $no['v'], $b ); |
1582 | } |
1583 | if ( $b2 === trim( implode( ',', (array)$opts[$idx]['ak'] ) ) ) { |
1584 | $a = $b; |
1585 | break; |
1586 | } |
1587 | } |
1588 | // use the alias (incl whitespace) from the original option wikitext |
1589 | // if found; otherwise use the last alias given (English default by |
1590 | // convention that works everywhere). |
1591 | // TODO: use first alias (localized) instead for RTL languages (T53852) |
1592 | if ( $a !== null && $no['ck'] !== 'caption' ) { |
1593 | $no['ak'] = $opts[$idx]['ak']; |
1594 | unset( $no['v'] ); // prevent double substitution |
1595 | } else { |
1596 | $no['ak'] = PHPUtils::lastItem( $no['ak'] ); |
1597 | if ( !( $no['ck'] === 'caption' && $a !== null ) ) { |
1598 | $changed = true; |
1599 | } |
1600 | } |
1601 | } |
1602 | |
1603 | // Filter out bogus options if the image options/caption have changed. |
1604 | if ( $changed ) { |
1605 | $nopts = array_filter( $nopts, static function ( $no ) { |
1606 | return $no['ck'] !== 'bogus'; |
1607 | } ); |
1608 | // empty captions should get filtered out in this case, too (T64264) |
1609 | $nopts = array_filter( $nopts, static function ( $no ) { |
1610 | return !( $no['ck'] === 'caption' && $no['ak'] === '' ); |
1611 | } ); |
1612 | } |
1613 | |
1614 | // sort! |
1615 | usort( $nopts, static function ( $a, $b ) { |
1616 | return $a['sortId'] <=> $b['sortId']; |
1617 | } ); |
1618 | |
1619 | // emit all the options as wikitext! |
1620 | $wikitext = '[[' . $resource['value']; |
1621 | foreach ( $nopts as $o ) { |
1622 | $wikitext .= '|'; |
1623 | if ( isset( $o['v'] ) ) { |
1624 | $wikitext .= str_replace( '$1', $o['v'], $o['ak'] ); |
1625 | } else { |
1626 | $wikitext .= $o['ak']; |
1627 | } |
1628 | } |
1629 | $wikitext .= ']]'; |
1630 | |
1631 | return new WikiLinkText( |
1632 | $wikitext, $outerElt, $state->getEnv()->getSiteConfig(), 'mw:File' |
1633 | ); |
1634 | } |
1635 | |
1636 | } |