Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 817 |
|
0.00% |
0 / 16 |
CRAP | |
0.00% |
0 / 1 |
LinkHandlerUtils | |
0.00% |
0 / 817 |
|
0.00% |
0 / 16 |
89102 | |
0.00% |
0 / 1 |
splitLinkContentString | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
30 | |||
getHref | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
56 | |||
normalizeIWP | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
escapeLinkTarget | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
6 | |||
getContentString | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
42 | |||
getLinkRoundTripData | |
0.00% |
0 / 130 |
|
0.00% |
0 / 1 |
3192 | |||
escapeExtLinkURL | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
addColonEscape | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
30 | |||
isURLLink | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
56 | |||
hasAutoUrlTerminatingChars | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
isSimpleWikiLink | |
0.00% |
0 / 49 |
|
0.00% |
0 / 1 |
306 | |||
serializeAsWikiLink | |
0.00% |
0 / 126 |
|
0.00% |
0 / 1 |
2352 | |||
serializeAsExtLink | |
0.00% |
0 / 21 |
|
0.00% |
0 / 1 |
72 | |||
linkHandler | |
0.00% |
0 / 68 |
|
0.00% |
0 / 1 |
380 | |||
figureHandler | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
6 | |||
figureToConstrainedText | |
0.00% |
0 / 339 |
|
0.00% |
0 / 1 |
12882 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Html2Wt; |
5 | |
6 | use stdClass; |
7 | use UnexpectedValueException; |
8 | use Wikimedia\Parsoid\Config\Env; |
9 | use Wikimedia\Parsoid\Core\MediaStructure; |
10 | use Wikimedia\Parsoid\DOM\Element; |
11 | use Wikimedia\Parsoid\DOM\Node; |
12 | use Wikimedia\Parsoid\DOM\Text; |
13 | use Wikimedia\Parsoid\Html2Wt\ConstrainedText\AutoURLLinkText; |
14 | use Wikimedia\Parsoid\Html2Wt\ConstrainedText\ConstrainedText; |
15 | use Wikimedia\Parsoid\Html2Wt\ConstrainedText\ExtLinkText; |
16 | use Wikimedia\Parsoid\Html2Wt\ConstrainedText\MagicLinkText; |
17 | use Wikimedia\Parsoid\Html2Wt\ConstrainedText\WikiLinkText; |
18 | use Wikimedia\Parsoid\Html2Wt\DOMHandlers\FallbackHTMLHandler; |
19 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
20 | use Wikimedia\Parsoid\NodeData\TempData; |
21 | use Wikimedia\Parsoid\Utils\DOMCompat; |
22 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
23 | use Wikimedia\Parsoid\Utils\DOMUtils; |
24 | use Wikimedia\Parsoid\Utils\PHPUtils; |
25 | use Wikimedia\Parsoid\Utils\TokenUtils; |
26 | use Wikimedia\Parsoid\Utils\UrlUtils; |
27 | use Wikimedia\Parsoid\Utils\Utils; |
28 | use Wikimedia\Parsoid\Utils\WTUtils; |
29 | use Wikimedia\Parsoid\Wt2Html\TokenizerUtils; |
30 | |
31 | /** |
32 | * Serializes link markup. |
33 | */ |
34 | class LinkHandlerUtils { |
// Matches a subject made up solely of space, tab, LF, CR, NUL and vertical-tab
// characters (the empty string included). The D modifier makes `$` match only
// at the very end of the subject, not before a trailing newline.
private const REDIRECT_TEST_RE = '/^([ \t\n\r\0\x0b])*$/D';
// Matches a run of one or more spaces, underscores, or the listed Unicode
// space/separator code points (U+00A0, U+1680, U+180E, U+2000-U+200A, U+2028,
// U+2029, U+202F, U+205F, U+3000). isSimpleWikiLink() replaces each such run
// with a single "_" before comparing against a normalized title key.
private const MW_TITLE_WHITESPACE_RE
	= '/[ _\xA0\x{1680}\x{180E}\x{2000}-\x{200A}\x{2028}\x{2029}\x{202F}\x{205F}\x{3000}]+/u';
38 | |
/**
 * Peel a recorded link tail and link prefix off a content string.
 *
 * The tail is checked (and removed) first; the prefix is then checked
 * against whatever remains. A tail or prefix that is empty, or that does
 * not actually sit at its end of the string, is reported back as ''.
 *
 * @param string $contentString
 * @param DataParsoid $dp Containing ->prefix and ->tail
 * @return stdClass With string members contentString, tail and prefix
 */
private static function splitLinkContentString( string $contentString, DataParsoid $dp ): stdClass {
	// Trailing part first: keep it only if the content really ends with it.
	$linkTail = $dp->tail ?? '';
	if ( $linkTail !== '' && str_ends_with( $contentString, $linkTail ) ) {
		$contentString = substr( $contentString, 0, -strlen( $linkTail ) );
	} else {
		$linkTail = '';
	}

	// Then the leading part, tested against the already-shortened content.
	$linkPrefix = $dp->prefix ?? '';
	if ( $linkPrefix !== '' && str_starts_with( $contentString, $linkPrefix ) ) {
		$contentString = substr( $contentString, strlen( $linkPrefix ) );
	} else {
		$linkPrefix = '';
	}

	$parts = new stdClass;
	$parts->contentString = $contentString;
	$parts->tail = $linkTail;
	$parts->prefix = $linkPrefix;
	return $parts;
}
71 | |
/**
 * Read the node's href, expanding root-relative URLs where possible.
 *
 * An href that starts with exactly one "/" is absolute but has no protocol.
 * For each interwiki entry that has both 'localinterwiki' and 'url' set, the
 * href is expanded against that URL; the first expansion that matches the
 * entry's URL pattern (with "$1" standing for anything) is returned.
 * Otherwise the href is returned as found ('' when the attribute is absent).
 *
 * @param Env $env
 * @param Element $node
 * @return string
 */
private static function getHref( Env $env, Element $node ): string {
	$href = DOMCompat::getAttribute( $node, 'href' ) ?? '';

	// "//host/path" is protocol-relative, not root-relative: leave it alone.
	$isRootRelative = ( $href[0] ?? '' ) === '/' && ( $href[1] ?? '' ) !== '/';
	if ( !$isRootRelative ) {
		return $href;
	}

	foreach ( $env->getSiteConfig()->interwikiMapNoNamespaces() as $interwikiInfo ) {
		if ( !isset( $interwikiInfo['localinterwiki'], $interwikiInfo['url'] ) ) {
			continue;
		}
		$baseUrl = $interwikiInfo['url'];

		// Resolve the href against this candidate base
		$expanded = UrlUtils::expandUrl( $href, $baseUrl );

		// Quote the base literally, then let its "$1" placeholder match anything
		$pattern = '/^' . strtr( preg_quote( $baseUrl, '/' ), [ '\\$1' => '.*' ] ) . '$/sD';
		if ( preg_match( $pattern, $expanded ) ) {
			return $expanded;
		}
	}

	return $href;
}
102 | |
/**
 * Normalize an interwiki prefix for use as a lookup key: lower-case it,
 * trim surrounding whitespace, then pass it through
 * PHPUtils::stripPrefix() with ':' (presumably dropping one leading colon).
 *
 * @param string $str
 * @return string
 */
private static function normalizeIWP( string $str ): string {
	$lowered = strtolower( $str );
	$trimmed = trim( $lowered );
	return PHPUtils::stripPrefix( $trimmed, ':' );
}
111 | |
/**
 * Entity-escape a link target and report whether the result is usable
 * as a wikilink target.
 *
 * @param string $linkTarget
 * @param SerializerState $state
 * @return stdClass With members linkTarget (the escaped string) and
 *   invalidLink (bool)
 */
private static function escapeLinkTarget( string $linkTarget, SerializerState $state ): stdClass {
	// Entity-escape the content.
	$escaped = Utils::escapeWtEntities( $linkTarget );

	$isValid = $state->getEnv()->isValidLinkTarget( $escaped );
	// `isValidLinkTarget` omits fragments (the part after #) so,
	// even though "|" is an invalid character, we still need to ensure
	// it doesn't appear in there. The percent encoded version is fine
	// in the fragment, since it won't break the parse.
	$hasPipe = strpos( $escaped, '|' ) !== false;

	return (object)[
		'linkTarget' => $escaped,
		// Is this an invalid link?
		'invalidLink' => !$isValid || $hasPipe,
	];
}
132 | |
/**
 * Return the node's content as plain text, or null when it cannot be
 * expressed that way.
 *
 * Text children contribute their value, mw:DisplaySpace children contribute
 * a single space, diff markers contribute nothing, and any other child makes
 * the whole result null.
 *
 * NOTE: The null-vs-empty-string behaviour is a little inconsistent: a node
 * without children yields null, yet a node holding only a diff marker
 * yields ''. One of the current callers seems to subtly depend on that.
 *
 * FIXME(T254501): This function can return `$node->textContent` instead
 * of the string concatenation once mw:DisplaySpace is preprocessed away.
 *
 * @param Node $node
 * @return ?string
 */
private static function getContentString( Node $node ): ?string {
	if ( !$node->hasChildNodes() ) {
		return null;
	}

	$text = '';
	for ( $child = $node->firstChild; $child; $child = $child->nextSibling ) {
		if ( $child instanceof Text ) {
			$text .= $child->nodeValue;
			continue;
		}
		if ( DOMUtils::hasTypeOf( $child, 'mw:DisplaySpace' ) ) {
			$text .= ' ';
			continue;
		}
		if ( !DiffUtils::isDiffMarker( $child ) ) {
			// Anything else is not representable as plain text
			return null;
		}
		// Diff markers are skipped silently
	}

	return $text;
}
166 | |
/**
 * Collect the round-trip data needed to serialize a link node.
 *
 * The returned object always carries: type, href, origHref, target, tail,
 * prefix, linkType and content (a stdClass that may hold ->string).
 * Depending on the node it may additionally carry contentModified,
 * contentNode, isRedirect, isInterwiki, isInterwikiLang and isLocal.
 *
 * `target` is always an array in the shape produced by
 * `serializedAttrVal()` (keys 'value', 'fromsrc', 'modified'), including on
 * the fallback path for interwiki links whose prefix cannot be matched, so
 * that callers can index `$target['value']` unconditionally.
 *
 * @param Env $env
 * @param Element $node
 * @param SerializerState $state
 * @return stdClass
 */
private static function getLinkRoundTripData(
	Env $env, Element $node, SerializerState $state
): stdClass {
	$dp = DOMDataUtils::getDataParsoid( $node );
	$siteConfig = $env->getSiteConfig();
	$rtData = (object)[
		'type' => null, // could be null
		'href' => null, // filled in below
		'origHref' => null, // filled in below
		'target' => null, // filled in below
		'tail' => $dp->tail ?? '',
		'prefix' => $dp->prefix ?? '',
		'linkType' => null
	];
	$rtData->content = new stdClass;
	$isIW = false;

	// Figure out the type of the link
	if ( $node->hasAttribute( 'rel' ) ) {
		$rel = DOMCompat::getAttribute( $node, 'rel' ) ?? '';
		// Parsoid only emits and recognizes ExtLink, WikiLink, MediaLink and PageProp rel values.
		// Everything else defaults to ExtLink during serialization (unless it is
		// serializable to a wikilink)
		// We're keeping the preg_match here instead of going through DOMUtils::matchRel
		// because we have \b guards to handle the multivalue, and we're keeping the matches,
		// which matchRel doesn't do.
		if ( preg_match( '/\b(mw:(WikiLink|ExtLink|MediaLink|PageProp)(\S*))\b/', $rel, $typeMatch ) ) {
			$rtData->type = $typeMatch[1];
			// Strip link subtype info
			if ( $typeMatch[2] === 'WikiLink' || $typeMatch[2] === 'ExtLink' ) {
				$rtData->type = 'mw:' . $typeMatch[2];
			}
			$isIW = (
				( $typeMatch[2] === 'WikiLink' && ( $typeMatch[3] ?? '' ) === '/Interwiki' ) ||
				// TODO: Remove this when we no longer have to worry about Flow boards
				( $typeMatch[2] === 'ExtLink' && ( $dp->isIW ?? false ) )
			);
		}
	}

	// Default link type if nothing else is set
	if ( $rtData->type === null && !DOMUtils::selectMediaElt( $node ) ) {
		$rtData->type = 'mw:ExtLink';
	}

	// Get href, and save the token's "real" href for comparison
	$href = self::getHref( $env, $node );
	$rtData->origHref = $href;
	$rtData->href = preg_replace( '#^(\.\.?/)+#', '', $href, 1 );

	// WikiLinks should be relative (but see below); fixup the link type
	// if a WikiLink has an absolute URL.
	// (This may get converted back to a WikiLink below, in the interwiki
	// handling code.)
	if ( $rtData->type === 'mw:WikiLink' &&
		( preg_match( '#^(\w+:)?//#', $rtData->href ) ||
			substr( $rtData->origHref ?? '', 0, 1 ) === '/' )
	) {
		$rtData->type = 'mw:ExtLink';
	}

	// Now get the target from rt data
	$rtData->target = $state->serializer->serializedAttrVal( $node, 'href' );

	// Check if the link content has been modified or is newly inserted content.
	// FIXME: This will only work with selser of course. Hard to test without selser.
	if (
		$state->inInsertedContent ||
		DiffUtils::hasDiffMark( $node, DiffMarkers::SUBTREE_CHANGED )
	) {
		$rtData->contentModified = true;
	}

	// Get the content string or tokens
	$contentString = self::getContentString( $node );
	if ( $contentString !== null ) {
		if ( !empty( $rtData->target['value'] ) && $rtData->target['value'] !== $contentString ) {
			// Try to identify a new potential tail
			$contentParts = self::splitLinkContentString( $contentString, $dp );
			$rtData->content->string = $contentParts->contentString;
			$rtData->tail = $contentParts->tail;
			$rtData->prefix = $contentParts->prefix;
		} else {
			$rtData->tail = '';
			$rtData->prefix = '';
			$rtData->content->string = $contentString;
		}
	} elseif ( $node->hasChildNodes() ) {
		$rtData->contentNode = $node;
	} elseif ( $rtData->type === 'mw:PageProp/redirect' ) {
		$rtData->isRedirect = true;
		$rtData->prefix = $dp->src
			?? ( ( $siteConfig->mwAliases()['redirect'][0] ?? '#REDIRECT' ) . ' ' );
	}

	// Update link type based on additional analysis.
	// What might look like external links might be serializable as a wikilink.
	// NB: $target is a reference, so writes through it update $rtData->target.
	$target = &$rtData->target;

	// mw:MediaLink annotations are considered authoritative
	// and interwiki link matches aren't made for these
	if ( $rtData->type === 'mw:MediaLink' ) {
		// Parse title from resource attribute (see analog in image handling)
		$resource = $state->serializer->serializedAttrVal( $node, 'resource' );
		if ( $resource['value'] === null ) {
			// from non-parsoid HTML: try to reconstruct resource from href?
			// (See similar code which tries to guess resource from <img src>)
			$mediaPrefix = $siteConfig->namespaceName( $siteConfig->namespaceId( 'media' ) );
			$slashPos = strrpos( $rtData->origHref, '/' );
			$fileName = $slashPos === false ? $rtData->origHref :
				substr( $rtData->origHref, $slashPos + 1 );
			$resource = [
				'value' => $mediaPrefix . ':' . $fileName,
				'fromsrc' => false,
				'modified' => false
			];
		}
		$rtData->target = $resource;
		$rtData->href = preg_replace( '#^(\.\.?/)+#', '', $rtData->target['value'], 1 );
		return $rtData;
	}

	// Check if the href matches any of our interwiki URL patterns
	$interwikiMatch = $siteConfig->interwikiMatcher( $href );
	if ( !$interwikiMatch ) {
		if ( $isIW ) {
			// If this is an interwiki but we can't find it then ignore the
			// data-parsoid href (which is probably just the interwiki link again)
			// and use the href from the <a> tag.
			// Keep the same array shape as every other target so that
			// consumers indexing $target['value'] keep working.
			$rtData->target = [
				'value' => DOMCompat::getAttribute( $node, 'href' ) ?? '',
				'fromsrc' => false,
				'modified' => false
			];
		}
		return $rtData;
	}

	$iwMap = $siteConfig->interwikiMapNoNamespaces();
	// Tolerate a matched prefix that is missing from the map
	$iw = $iwMap[ltrim( $interwikiMatch[0], ':' )] ?? null;
	$localInterwiki = !empty( $iw['local'] );

	// Only to be used in question mark check, since other checks want to include the fragment
	$targetForQmarkCheck = $interwikiMatch[1];
	// FIXME: If ever the default value for $wgExternalInterwikiFragmentMode
	// changes, we can reduce this by always stripping off the fragment
	// identifier, since in "html5" mode, that isn't encoded. At present,
	// we can only do that if we know it's a local interwiki link.
	if ( $localInterwiki ) {
		$withoutFragment = strstr( $targetForQmarkCheck, '#', true );
		if ( $withoutFragment !== false ) {
			$targetForQmarkCheck = $withoutFragment;
		}
	}

	if (
		// Question mark is a valid title char, so it won't fail the test below,
		// but gets percent encoded on the way out since it has special
		// semantics in a url. That will break the url we're serializing, so
		// protect it.
		strpos( $targetForQmarkCheck, '?' ) === false &&
		// Ensure we have a valid link target, otherwise falling back to extlink
		// is preferable, since it won't serialize as a link.
		(
			$interwikiMatch[1] === '' || !self::escapeLinkTarget(
				// Append the prefix since we want to validate the target
				// with respect to it being an interwiki.
				$interwikiMatch[0] . ':' . $interwikiMatch[1],
				$state
			)->invalidLink
		) &&
		// ExtLinks should have content to convert.
		(
			$rtData->type !== 'mw:ExtLink' ||
			!empty( $rtData->content->string ) ||
			!empty( $rtData->contentNode )
		) &&
		( $isIW || !empty( $target['modified'] ) || !empty( $rtData->contentModified ) )
	) {
		// External link that is really an interwiki link. Convert it.
		if ( $rtData->type === 'mw:ExtLink' ) {
			$rtData->type = 'mw:WikiLink';
		}
		$rtData->isInterwiki = true;
		// could this be confused with a language link?
		$iwi = $iwMap[self::normalizeIWP( $interwikiMatch[0] )] ?? null;
		$rtData->isInterwikiLang = $iwi && isset( $iwi['language'] );
		// is this our own wiki?
		$rtData->isLocal = $iwi && isset( $iwi['localinterwiki'] );
		// strip off localinterwiki prefixes
		$localPrefix = '';
		$oldPrefix = null;
		while ( true ) {
			$tmp = substr( $target['value'], strlen( $localPrefix ) );
			if ( !preg_match( '/^(:?([^:]+)):/', $tmp, $oldPrefix ) ) {
				break;
			}
			$iwi = $iwMap[Utils::normalizeNamespaceName( $oldPrefix[2] )] ?? null;
			if ( !$iwi || !isset( $iwi['localinterwiki'] ) ) {
				break;
			}
			$localPrefix .= $oldPrefix[1] . ':';
		}

		if ( !empty( $target['fromsrc'] ) && empty( $target['modified'] ) ) {
			// Leave the target alone!
		} else {
			if ( $rtData->type === 'mw:PageProp/Language' ) {
				$targetValue = implode( ':', $interwikiMatch );
				// Strip initial colon
				if ( $targetValue[0] === ':' ) {
					$targetValue = substr( $targetValue, 1 );
				}
				$target['value'] = $targetValue;
			} elseif (
				$oldPrefix && ( // Should we preserve the old prefix?
					strcasecmp( $oldPrefix[1], $interwikiMatch[0] ) === 0 ||
					// Check if the old prefix mapped to the same URL as
					// the new one. Use the old one if that's the case.
					// Example: [[w:Foo]] vs. [[:en:Foo]]
					( $iwMap[self::normalizeIWP( $oldPrefix[1] )]['url'] ?? null )
						=== ( $iwMap[self::normalizeIWP( $interwikiMatch[0] )]['url'] ?? null )
				)
			) {
				// Reuse old prefix capitalization
				if ( Utils::decodeWtEntities( substr( $target['value'], strlen( $oldPrefix[1] ) + 1 ) )
					!== $interwikiMatch[1]
				) {
					// Modified, update target.value.
					$target['value'] = $localPrefix . $oldPrefix[1] . ':' . $interwikiMatch[1];
				}
				// Ensure that we generate an interwiki link and not a language link!
				if ( $rtData->isInterwikiLang && $target['value'][0] !== ':' ) {
					$target['value'] = ':' . $target['value'];
				}
			} else { // Else: preserve old encoding
				if ( !empty( $rtData->isLocal ) ) {
					// - interwikiMatch[0] will be something like ":en" or "w"
					// - This tests whether the interwiki-like link is actually
					// a local wikilink.

					$target['value'] = $interwikiMatch[1];
					// interwikiMatch[1] may start with a language link prefix,
					// ensure that we generate interwiki link syntax in that case. (T292022)
					if (
						preg_match( '/^([^:]+):/', $target['value'], $match ) &&
						!empty( $iwMap[self::normalizeIWP( $match[1] )]['language'] )
					) {
						$target['value'] = ':' . $target['value'];
					}

					$rtData->isInterwiki = $rtData->isInterwikiLang = false;
				} else {
					$target['value'] = implode( ':', $interwikiMatch );
				}
			}
		}
	}

	return $rtData;
}
431 | |
/**
 * The provided URL is already percent-encoded -- but it may still
 * not be safe for wikitext. Add additional escapes to make the URL
 * wikitext-safe. Don't touch percent escapes already in the url,
 * though!
 *
 * Works in two passes: first every wikitext-unsafe character (and a "-"
 * directly followed by "{") is replaced by whatever
 * Utils::entityEncodeAll() returns for it; then, because that first pass
 * also encodes the square brackets around an IPv6 host, the brackets of a
 * leading IPv6 host are restored.
 *
 * @param string $urlStr
 * @return string
 */
private static function escapeExtLinkURL( string $urlStr ): string {
	// this regexp is the negation of EXT_LINK_URL_CLASS in the PHP parser
	$encoded = preg_replace_callback(
		// phpcs:ignore Generic.Files.LineLength.TooLong
		'/[\]\[<>"\x00-\x20\x7F\x{A0}\x{1680}\x{180E}\x{2000}-\x{200A}\x{202F}\x{205F}\x{3000}]|-(?=\{)/u',
		static function ( $m ) {
			return Utils::entityEncodeAll( $m[0] );
		},
		$urlStr
	);

	// IPv6 host names are bracketed with []. Entity-decode these.
	// The pattern must look for the *encoded* brackets: by this point the
	// literal "[" and "]" have been replaced by the pass above.
	// NOTE(review): assumes Utils::entityEncodeAll() emits hexadecimal
	// character references (&#x5B; / &#x5D;) -- confirm against that helper.
	// The i modifier makes the hex digits case-insensitive.
	return preg_replace(
		'!^([a-z][^:/]*:)?//&#x5b;([0-9a-f:.]+)&#x5d;(:\d|/|$)!iD',
		'$1//[$2]$3',
		$encoded,
		1
	);
}
457 | |
/**
 * Prepend a ':' to a wikilink target when the link would otherwise be
 * read as a category or file inclusion.
 *
 * The colon is added only when all of these hold: the target's title is
 * in the category or file namespace, the link type is mw:WikiLink, and the
 * target does not already begin with ':'.
 *
 * @param Env $env
 * @param string $linkTarget
 * @param stdClass $linkData
 * @return string
 */
private static function addColonEscape(
	Env $env, string $linkTarget, stdClass $linkData
): string {
	$title = $env->makeTitleFromText( $linkTarget );
	$siteConfig = $env->getSiteConfig();
	$escapedNamespaces = [
		$siteConfig->canonicalNamespaceId( 'category' ),
		$siteConfig->canonicalNamespaceId( 'file' ),
	];

	if ( !in_array( $title->getNamespace(), $escapedNamespaces, true ) ) {
		return $linkTarget;
	}
	if ( $linkData->type !== 'mw:WikiLink' ) {
		return $linkTarget;
	}
	if ( $linkTarget[0] === ':' ) {
		// Already escaped
		return $linkTarget;
	}

	// Escape category and file links
	return ':' . $linkTarget;
}
481 | |
/**
 * Decide whether a link can be written as a bare URL.
 *
 * That requires non-empty plain-text content which equals either the
 * target value or the node's href, does not start with "//", passes
 * Utils::isProtocolValid(), and does not end in a character that would
 * terminate an autourl.
 *
 * @param Env $env
 * @param Element $node
 * @param stdClass $linkData
 * @return bool
 */
private static function isURLLink( Env $env, Element $node, stdClass $linkData ): bool {
	$target = $linkData->target;

	// Get plain text content, if any; null and '' both rule out a URL link
	$text = self::getContentString( $node );
	if ( $text === null || $text === '' ) {
		return false;
	}

	// Can we minimize this? Only if the text *is* the destination.
	if ( $target['value'] !== $text && self::getHref( $env, $node ) !== $text ) {
		return false;
	}

	// protocol-relative url links not allowed in text
	// (see autourl rule in peg tokenizer, T32269)
	if ( str_starts_with( $text, '//' ) || !Utils::isProtocolValid( $text, $env ) ) {
		return false;
	}

	return !self::hasAutoUrlTerminatingChars( $text );
}
504 | |
/**
 * The legacy parser Parser.php::makeFreeExternalLink terminates an autourl when encountering
 * some characters; since we wish to mimic that behaviour we need this method to check whether
 * the provided URL is in that case.
 *
 * Only the final byte of the URL is examined, against the character set
 * returned by TokenizerUtils::getAutoUrlTerminatingChars(); that helper is
 * told whether the URL contains a "(". An empty URL has no final
 * character and therefore yields false.
 *
 * @param string $url
 * @return bool
 */
private static function hasAutoUrlTerminatingChars( string $url ): bool {
	if ( $url === '' ) {
		// substr( '', -1 ) is '', and str_contains( $sep, '' ) is always
		// true, which would wrongly report a terminating character.
		return false;
	}
	$sep = TokenizerUtils::getAutoUrlTerminatingChars( str_contains( $url, '(' ) );
	return str_contains( $sep, substr( $url, -1 ) );
}
516 | |
/**
 * Decide whether a wikilink can be written in simple form rather than as
 * a piped link.
 *
 * A link is simple when it has string content, is not an untouched piped
 * link, its content does not start with "./", and the content matches the
 * target (or href) under one of the comparisons tried below.
 *
 * @param Env $env
 * @param DataParsoid $dp
 * @param array $target
 * @param stdClass $linkData
 * @return bool
 */
private static function isSimpleWikiLink(
	Env $env, DataParsoid $dp, array $target, stdClass $linkData
): bool {
	$contentString = $linkData->content->string ?? null;

	// FIXME (SSS):
	// 1. Revisit this logic to see if all these checks
	// are still relevant or whether this can be simplified somehow.
	// 2. There are also duplicate computations for env.normalizedTitleKey(..)
	// and Util.decodeURIComponent(..) that could be removed.

	// Would need to pipe for any non-string content.
	if ( $contentString === null ) {
		return false;
	}

	// Preserve unmodified or non-minimal piped links.
	$isUntouchedPipedLink = empty( $target['modified'] ) &&
		empty( $linkData->contentModified ) &&
		( $dp->stx ?? null ) === 'piped';
	if ( $isUntouchedPipedLink ) {
		return false;
	}

	// Relative links are not simple
	if ( str_starts_with( $contentString, './' ) ) {
		return false;
	}

	// Strip colon escapes from the original target as that is
	// stripped when deriving the content string.
	// Strip ./ prefixes as well since they are relative link prefixes
	// added to all titles.
	// The prefix stripping, when it occurs, also includes spaces before the prefix.
	// Finally, we also remove trailing spaces because these are removed for <a> links
	// by DOMNormalizer::moveTrailingSpacesOut, and we wouldn't want that to lead to the
	// link getting piped for only that reason.
	$strippedTargetValue = rtrim(
		preg_replace( '#^\s*(:|\./)#', '', $target['value'], 1 )
	);

	// Strip colon escape after prefix for interwikis
	if ( !empty( $linkData->isInterwiki ) ) {
		$strippedTargetValue = preg_replace( '#^(\w+:):#', '$1', $strippedTargetValue, 1 );
	}

	$decodedTarget = Utils::decodeWtEntities( $strippedTargetValue );
	// Deal with the protocol-relative link scenario as well
	$hrefHasProto = preg_match( '#^(\w+:)?//#', $linkData->href );

	// Normalize content string and decoded target before comparison.
	// Piped links don't come down this path => it is safe to normalize both.
	$contentString = str_replace( '_', ' ', $contentString );
	$decodedTarget = str_replace( '_', ' ', $decodedTarget );

	// See if the (normalized) content matches the
	// target, either shadowed or actual.
	if ( $contentString === $decodedTarget ) {
		return true;
	}

	// try wrapped in forward slashes in case they were stripped
	if ( ( '/' . $contentString . '/' ) === $decodedTarget ) {
		return true;
	}

	// normalize as titles and compare
	// FIXME: This will strip an interwiki prefix. Is that right?
	if (
		$env->normalizedTitleKey( $contentString, true )
			=== preg_replace( self::MW_TITLE_WHITESPACE_RE, '_', $decodedTarget )
	) {
		return true;
	}

	// Relative link: "../Foo" resolved against the context title
	if (
		$env->getSiteConfig()->namespaceHasSubpages(
			$env->getContextTitle()->getNamespace()
		) &&
		preg_match( '#^\.\./.*[^/]$#D', $strippedTargetValue ) &&
		$contentString === $env->resolveTitle( $strippedTargetValue )
	) {
		return true;
	}

	// Relative link: "../Foo/" whose content is the bare "Foo"
	if (
		preg_match( '#^\.\./.*?/$#D', $strippedTargetValue ) &&
		$contentString === preg_replace( '#^(?:\.\./)+(.*?)/$#D', '$1', $strippedTargetValue, 1 )
	) {
		return true;
	}

	// if content == href this could be a simple link... eg [[Foo]].
	// but if href is an absolute url with protocol, this won't
	// work: [[http://example.com]] is not a valid simple link!
	if ( $hrefHasProto ) {
		return false;
	}

	// Always compare against decoded uri because
	// <a rel="mw:WikiLink" href="7%25 Solution">7%25 Solution</a></p>
	// should serialize as [[7% Solution|7%25 Solution]]
	$decodedHref = Utils::decodeURIComponent( $linkData->href );
	if ( $contentString === $decodedHref ) {
		return true;
	}

	// normalize with underscores for comparison with href
	return $env->normalizedTitleKey( $contentString, true ) === $decodedHref;
}
616 | |
617 | /** |
618 | * Serialize as wiki link |
619 | * @param Element $node |
620 | * @param SerializerState $state |
621 | * @param stdClass $linkData |
622 | */ |
623 | private static function serializeAsWikiLink( |
624 | Element $node, SerializerState $state, stdClass $linkData |
625 | ): void { |
626 | $contentParts = null; |
627 | $contentSrc = ''; |
628 | $isPiped = false; |
629 | $needsEscaping = true; |
630 | $env = $state->getEnv(); |
631 | $siteConfig = $env->getSiteConfig(); |
632 | $target = $linkData->target; |
633 | $dp = DOMDataUtils::getDataParsoid( $node ); |
634 | |
635 | // Decode any link that did not come from the source (data-mw/parsoid) |
636 | // Links that come from data-mw/data-parsoid will be true titles, |
637 | // but links that come from hrefs will need to be url-decoded. |
638 | // Ex: <a href="/wiki/A%3Fb">Foobar</a> |
639 | if ( empty( $target['fromsrc'] ) ) { |
640 | // Omit fragments from decoding |
641 | $hash = strpos( $target['value'], '#' ); |
642 | if ( $hash !== false ) { |
643 | $target['value'] = Utils::decodeURIComponent( substr( $target['value'], 0, $hash ) ) |
644 | . substr( $target['value'], $hash ); |
645 | } else { |
646 | $target['value'] = Utils::decodeURIComponent( $target['value'] ); |
647 | } |
648 | } |
649 | |
650 | // Special-case handling for category links |
651 | if ( $linkData->type === 'mw:PageProp/Category' ) { |
652 | // Split target and sort key in $target['value']. |
653 | // The sort key shows up as "#something" in there. |
654 | // However, watch out for parser functions that start with "{{#" |
655 | // The atomic group is essential to prevent "{{#" parser function prefix |
656 | // from getting split at the "{{" and "#" where the "{{" matches the |
657 | // [^#]* and the "#" matches after separately. |
658 | if ( preg_match( '/^((?>{{#|[^#])*)#(.*)/', $target['value'], $targetParts ) ) { |
659 | $target['value'] = strtr( preg_replace( '#^(\.\.?/)*#', '', $targetParts[1], 1 ), '_', ' ' ); |
660 | // FIXME: Reverse `Sanitizer.sanitizeTitleURI(strContent).replace(/#/g, '%23');` |
661 | $strContent = Utils::decodeURIComponent( $targetParts[2] ); |
662 | $contentParts = self::splitLinkContentString( $strContent, $dp ); |
663 | $linkData->content->string = $contentParts->contentString; |
664 | $dp->tail = $linkData->tail = $contentParts->tail; |
665 | $dp->prefix = $linkData->prefix = $contentParts->prefix; |
666 | } else { // No sort key, will serialize to simple link |
667 | // Normalize the content string |
668 | $linkData->content->string = strtr( |
669 | PHPUtils::stripPrefix( $target['value'], './' ), '_', ' ' |
670 | ); |
671 | } |
672 | |
673 | // Special-case handling for template-affected sort keys |
674 | // FIXME: sort keys cannot be modified yet, but if they are, |
675 | // we need to fully shadow the sort key. |
676 | // if ( !target.modified ) { |
677 | // The target and source key was not modified |
678 | $sortKeySrc = $state->serializer->serializedAttrVal( $node, 'mw:sortKey' ); |
679 | if ( isset( $sortKeySrc['value'] ) ) { |
680 | $linkData->contentNode = null; |
681 | $linkData->content->string = $sortKeySrc['value']; |
682 | // TODO: generalize this flag. It is already used by |
683 | // getAttributeShadowInfo. Maybe use the same |
684 | // structure as its return value? |
685 | $linkData->content->fromsrc = true; |
686 | } |
687 | // } |
688 | } else { |
689 | if ( $linkData->type === 'mw:PageProp/Language' ) { |
690 | // Fix up the content string |
691 | // TODO: see if linkData can be cleaner! |
692 | $linkData->content->string ??= Utils::decodeWtEntities( $target['value'] ); |
693 | } |
694 | } |
695 | |
696 | // The string value of the content, if it is plain text. |
697 | $linkTarget = null; |
698 | $escapedTgt = null; |
699 | if ( !empty( $linkData->isRedirect ) ) { |
700 | $linkTarget = $target['value']; |
701 | if ( !empty( $target['modified'] ) || empty( $target['fromsrc'] ) ) { |
702 | $linkTarget = strtr( preg_replace( '#^(\.\.?/)*#', '', $linkTarget, 1 ), '_', ' ' ); |
703 | $escapedTgt = self::escapeLinkTarget( $linkTarget, $state ); |
704 | $linkTarget = $escapedTgt->linkTarget; |
705 | // Determine if it's a redirect to a category, in which case |
706 | // it needs a ':' in front to distinguish it from a category link.
707 | if ( preg_match( '/^([^:]+)[:]/', $linkTarget, $categoryMatch ) ) { |
708 | $ns = $siteConfig->namespaceId( Utils::normalizeNamespaceName( $categoryMatch[1] ) ); |
709 | if ( $ns === $siteConfig->canonicalNamespaceId( 'category' ) ) { |
710 | // Check that the next node isn't a category link, |
711 | // in which case we don't want the ':'. |
712 | $nextNode = $node->nextSibling; |
713 | if ( !( |
714 | $nextNode instanceof Element && DOMCompat::nodeName( $nextNode ) === 'link' && |
715 | DOMUtils::hasRel( $nextNode, 'mw:PageProp/Category' ) && |
716 | DOMCompat::getAttribute( $nextNode, 'href' ) === DOMCompat::getAttribute( $node, 'href' ) |
717 | ) ) { |
718 | $linkTarget = ':' . $linkTarget; |
719 | } |
720 | } |
721 | } |
722 | } |
723 | } elseif ( self::isSimpleWikiLink( $env, $dp, $target, $linkData ) ) { |
724 | // Simple case |
725 | if ( empty( $target['modified'] ) && empty( $linkData->contentModified ) ) { |
726 | $linkTarget = PHPUtils::stripPrefix( $target['value'], './' ); |
727 | } else { |
728 | // If token has templated attrs or is a subpage, use target.value |
729 | // since content string will be drastically different. |
730 | if ( WTUtils::hasExpandedAttrsType( $node ) || |
731 | preg_match( '#(^|/)\.\./#', $target['value'] ) |
732 | ) { |
733 | $linkTarget = PHPUtils::stripPrefix( $target['value'], './' ); |
734 | } else { |
735 | $escapedTgt = self::escapeLinkTarget( $linkData->content->string, $state ); |
736 | if ( !$escapedTgt->invalidLink ) { |
737 | $linkTarget = self::addColonEscape( $env, $escapedTgt->linkTarget, $linkData ); |
738 | } else { |
739 | $linkTarget = $escapedTgt->linkTarget; |
740 | } |
741 | } |
742 | if ( !empty( $linkData->isInterwikiLang ) && |
743 | $linkTarget[0] !== ':' && |
744 | $linkData->type !== 'mw:PageProp/Language' |
745 | ) { |
746 | // ensure interwiki links can't be confused with |
747 | // interlanguage links. |
748 | $linkTarget = ':' . $linkTarget; |
749 | } |
750 | } |
751 | } elseif ( self::isURLLink( $state->getEnv(), $node, $linkData ) |
752 | /* && empty( $linkData->isInterwiki ) */ |
753 | ) { |
754 | // Uncomment the above check if we want [[wikipedia:Foo|http://en.wikipedia.org/wiki/Foo]] |
755 | // for '<a href="http://en.wikipedia.org/wiki/Foo">http://en.wikipedia.org/wiki/Foo</a>' |
756 | $linkData->linkType = 'mw:URLLink'; |
757 | } else { |
758 | // Emit piped wikilink syntax |
759 | $isPiped = true; |
760 | |
761 | // First get the content source |
762 | if ( !empty( $linkData->contentNode ) ) { |
763 | $cs = $state->serializeLinkChildrenToString( |
764 | $linkData->contentNode, |
765 | [ $state->serializer->wteHandlers, 'wikilinkHandler' ] |
766 | ); |
767 | // strip off the tail and handle the pipe trick |
768 | $contentParts = self::splitLinkContentString( $cs, $dp ); |
769 | $contentSrc = $contentParts->contentString; |
770 | $dp->tail = $contentParts->tail; |
771 | $linkData->tail = $contentParts->tail; |
772 | $dp->prefix = $contentParts->prefix; |
773 | $linkData->prefix = $contentParts->prefix; |
774 | $needsEscaping = false; |
775 | } else { |
776 | $contentSrc = $linkData->content->string ?? ''; |
777 | $needsEscaping = empty( $linkData->content->fromsrc ); |
778 | } |
779 | |
780 | if ( $contentSrc === '' && $linkData->type !== 'mw:PageProp/Category' ) { |
781 | // Protect empty link content from PST pipe trick |
782 | $contentSrc = '<nowiki/>'; |
783 | $needsEscaping = false; |
784 | } |
785 | |
786 | $linkTarget = $target['value']; |
787 | if ( !empty( $target['modified'] ) || empty( $target['fromsrc'] ) ) { |
788 | // Links starting with ./ shouldn't get _ replaced with ' ' |
789 | $linkContentIsRelative = str_starts_with( $linkData->content->string ?? '', './' ); |
790 | $linkTarget = preg_replace( '#^(\.\.?/)*#', '', $linkTarget, 1 ); |
791 | if ( empty( $linkData->isInterwiki ) && !$linkContentIsRelative ) { |
792 | $linkTarget = strtr( $linkTarget, '_', ' ' ); |
793 | } |
794 | $escapedTgt = self::escapeLinkTarget( $linkTarget, $state ); |
795 | $linkTarget = $escapedTgt->linkTarget; |
796 | } |
797 | |
798 | // If we are reusing the target from source, we don't |
799 | // need to worry about colon-escaping because it will |
800 | // be in the right form already. |
801 | // |
802 | // Trying to eliminate this check and always check for |
803 | // colon-escaping seems a bit tricky when the reused |
804 | // target has encoded entities that won't resolve to |
805 | // valid titles. |
806 | if ( ( !$escapedTgt || !$escapedTgt->invalidLink ) && empty( $target['fromsrc'] ) ) { |
807 | $linkTarget = self::addColonEscape( $env, $linkTarget, $linkData ); |
808 | } |
809 | } |
810 | if ( $linkData->linkType === 'mw:URLLink' ) { |
811 | $state->emitChunk( new AutoURLLinkText( $node->textContent, $node ), $node ); |
812 | return; |
813 | } |
814 | |
815 | if ( !empty( $linkData->isRedirect ) ) { |
816 | // Drop duplicates |
817 | if ( $state->redirectText !== null ) { |
818 | return; |
819 | } |
820 | |
821 | // Buffer redirect text if it is not in start of file position |
822 | if ( !preg_match( self::REDIRECT_TEST_RE, $state->out . $state->currLine->text ) ) { |
823 | $state->redirectText = $linkData->prefix . '[[' . $linkTarget . ']]'; |
824 | $state->emitChunk( '', $node ); // Flush separators for this node |
825 | // Flush separators for this node |
826 | return; |
827 | } |
828 | |
829 | // Set to some non-null string |
830 | $state->redirectText = 'unbuffered'; |
831 | } |
832 | |
833 | $pipedText = null; |
834 | if ( $escapedTgt && $escapedTgt->invalidLink ) { |
835 | // If the link target was invalid, instead of emitting an invalid link, |
836 | // omit the link and serialize just the content instead. But, log the |
837 | // invalid html for Parsoid clients to investigate later. |
838 | $state->getEnv()->log( |
839 | 'error/html2wt/link', 'Bad title text', DOMCompat::getOuterHTML( $node ) |
840 | ); |
841 | |
842 | // For non-piped content, use the original invalid link text |
843 | $pipedText = $isPiped ? $contentSrc : $linkTarget; |
844 | $state->needsEscaping = $needsEscaping; |
845 | $state->emitChunk( $linkData->prefix . $pipedText . $linkData->tail, $node ); |
846 | } else { |
847 | $pipe = $dp->firstPipeSrc ?? '|'; |
848 | if ( $isPiped && $needsEscaping ) { |
849 | // We are definitely not in sol context since content |
850 | // will be preceded by "[[" or "[" text in target wikitext. |
851 | $pipedText = $pipe . $state->serializer->wteHandlers |
852 | ->escapeLinkContent( $state, $contentSrc, false, $node, false ); |
853 | } elseif ( $isPiped ) { |
854 | $pipedText = $pipe . $contentSrc; |
855 | } else { |
856 | $pipedText = ''; |
857 | } |
858 | if ( $isPiped ) { |
859 | $state->singleLineContext->disable(); |
860 | } |
861 | $state->emitChunk( new WikiLinkText( |
862 | $linkData->prefix . '[[' . $linkTarget . $pipedText . ']]' . $linkData->tail, |
863 | $node, $siteConfig, $linkData->type |
864 | ), $node ); |
865 | if ( $isPiped ) { |
866 | $state->singleLineContext->pop(); |
867 | } |
868 | } |
869 | } |
870 | |
/**
 * Serialize a link as an external link.
 *
 * Emits exactly one chunk for $node, in one of these forms:
 * - a bare URL (AutoURLLinkText) when isURLLink() accepts the link;
 * - `[[#anchor]]` / `[[#anchor|content]]` (WikiLinkText) when the target
 *   string starts with '#';
 * - `[url]` / `[url content]` (ExtLinkText) otherwise. The form without
 *   content is the auto-numbered external link.
 *
 * @param Element $node The link element being serialized
 * @param SerializerState $state
 * @param stdClass $linkData Round-trip data for the link; `target` (shadow
 *   info array with 'value' and optional 'modified' / 'fromsrc') and `type`
 *   are read here.
 */
private static function serializeAsExtLink(
	Element $node, SerializerState $state, stdClass $linkData
): void {
	$target = $linkData->target;
	$urlStr = $target['value'];
	if ( !empty( $target['modified'] ) || empty( $target['fromsrc'] ) ) {
		// We expect modified hrefs to be percent-encoded already, so
		// don't need to encode them here any more. Unmodified hrefs are
		// just using the original encoding anyway.
		// BUT we do have to encode certain special wikitext
		// characters (like []) which aren't necessarily
		// percent-encoded because they are valid in URLs and HTML5
		$urlStr = self::escapeExtLinkURL( $urlStr );
	}

	if ( self::isURLLink( $state->getEnv(), $node, $linkData ) ) {
		// Serialize as URL link
		$state->emitChunk( new AutoURLLinkText( $urlStr, $node ), $node );
		return;
	}

	$siteConfig = $state->getEnv()->getSiteConfig();

	$pureHashMatch = str_starts_with( $urlStr, '#' );
	// Fully serialize the content
	$contentStr = $state->serializeLinkChildrenToString(
		$node,
		[ $state->serializer->wteHandlers, $pureHashMatch ? 'wikilinkHandler' : 'aHandler' ]
	);

	// Compare against the empty string explicitly: a plain truthiness test
	// would treat the legitimate link text "0" as "no content" and drop it,
	// turning `[http://example.com 0]` into the auto-numbered
	// `[http://example.com]`.
	$hasContent = (string)$contentStr !== '';

	if ( $pureHashMatch ) {
		// If it's just anchor text, serialize as an internal link.
		$chunk = new WikiLinkText(
			'[[' . $urlStr . ( $hasContent ? '|' . $contentStr : '' ) . ']]',
			$node, $siteConfig, $linkData->type
		);
	} else {
		// Without content this is the auto-numbered external link
		// form: [http://example.com]
		$chunk = new ExtLinkText(
			'[' . $urlStr . ( $hasContent ? ' ' . $contentStr : '' ) . ']',
			$node, $siteConfig, $linkData->type
		);
	}
	$state->emitChunk( $chunk, $node );
}
921 | |
/**
 * Entry point for serializing a link element to wikitext.
 *
 * The handler tries, in order:
 * 1. magic-link syntax, when the site's ext-resource URL matcher
 *    recognizes the original href;
 * 2. wikilink or extlink syntax, when round-trip data provides both a
 *    link type and a target value;
 * 3. a best-effort fallback for unannotated links: a basic
 *    `<a><img></a>` figure, an extlink, or plain text / a named span
 *    when there is no href.
 *
 * @param SerializerState $state
 * @param Element $node
 * @throws UnexpectedValueException if type and target are known but the
 *   type is not one this handler knows how to serialize
 */
public static function linkHandler( SerializerState $state, Element $node ): void {
	// TODO: handle internal/external links etc using RDFa and dataParsoid
	// Also convert unannotated html links without advanced attributes to
	// external wiki links for html import. Might want to consider converting
	// relative links without path component and file extension to wiki links.
	$env = $state->getEnv();
	$siteConfig = $env->getSiteConfig();

	// Round-trip data gathered from the token and tplAttrs
	$linkData = self::getLinkRoundTripData( $env, $node, $state );
	$linkType = $linkData->type;

	// A link that could be a magic link is serialized as one. (If magic
	// links are disabled, the matcher returns false.)
	$matcher = $siteConfig->getExtResourceURLPatternMatcher();
	$magicLinkMatch = $matcher( Utils::decodeURI( $linkData->origHref ) );
	if ( $magicLinkMatch !== false ) {
		// PMIDs that were originally written as interwiki links are
		// round-tripped as interwikis, so the link type is left alone.
		$keepAsInterwiki = $magicLinkMatch[0] === 'PMID' &&
			DOMUtils::matchRel( $node, '|^mw:WikiLink/Interwiki\b|' ) !== null &&
			$linkType === 'mw:WikiLink';
		if ( !$keepAsInterwiki ) {
			$magicContent = $state->serializeLinkChildrenToString(
				$node,
				[ $state->serializer->wteHandlers, 'aHandler' ]
			);
			$magicText = $siteConfig->makeExtResourceURL(
				$magicLinkMatch, $linkData->origHref, $magicContent
			);
			if ( $magicText[0] !== '[' ) {
				// Successfully serialized as a magic link
				$state->emitChunk( new MagicLinkText( $magicText, $node ), $node );
				return;
			}
		}
	}

	if ( $linkType !== null && isset( $linkData->target['value'] ) ) {
		// We have a type and target info
		$usesDoubleBrackets = $linkType === 'mw:WikiLink' ||
			$linkType === 'mw:MediaLink' ||
			preg_match( TokenUtils::SOL_TRANSPARENT_LINK_REGEX, $linkType );
		if ( $usesDoubleBrackets ) {
			// [[..]] links: normal, category, redirect, or lang links
			// (except images)
			self::serializeAsWikiLink( $node, $state, $linkData );
			return;
		}
		if ( $linkType === 'mw:ExtLink' ) {
			// [..] links, autolinks, ISBN, RFC, PMID
			self::serializeAsExtLink( $node, $state, $linkData );
			return;
		}
		throw new UnexpectedValueException(
			'Unhandled link serialization scenario: ' . DOMCompat::getOuterHTML( $node )
		);
	}

	// No usable type/target information: fall back to generic handling.
	$safeAttr = [
		'href' => true,
		'rel' => true,
		'class' => true,
		'title' => true,
		DOMDataUtils::DATA_OBJECT_ATTR_NAME => true
	];

	// XXX: Don't drop rel and class in every case once a tags are
	// actually supported in the MW default config?
	$isComplexLink = false;
	foreach ( DOMUtils::attributes( $node ) as $attrName => $attrValue ) {
		if ( !isset( $safeAttr[$attrName] ) ) {
			// Any attribute outside the safe list makes this a complex link
			$isComplexLink = true;
			break;
		}
	}

	if ( $isComplexLink ) {
		$env->log( 'error/html2wt/link', 'Encountered', DOMCompat::getOuterHTML( $node ),
			'-- serializing as extlink and dropping <a> attributes unsupported in wikitext.'
		);
	} else {
		$media = DOMUtils::selectMediaElt( $node ); // TODO: Handle missing media too
		if ( $media instanceof Element && $media->parentNode === $node ) {
			// this is a basic html figure: <a><img></a>
			self::figureHandler( $state, $node, new MediaStructure( $media, $node ) );
			return;
		}
	}

	// href is already percent-encoded, etc., but it might contain
	// spaces or other wikitext nasties. escape the nasties.
	$hrefStr = self::escapeExtLinkURL( self::getHref( $env, $node ) );
	$str = $state->serializeLinkChildrenToString(
		$node,
		[ $state->serializer->wteHandlers, 'aHandler' ]
	);

	if ( $hrefStr ) {
		$state->emitChunk(
			new ExtLinkText( '[' . $hrefStr . ' ' . $str . ']',
				$node, $siteConfig, 'mw:ExtLink'
			),
			$node
		);
		return;
	}

	// Without an href, we just emit the string as text.
	// However, to preserve targets for anchor links,
	// serialize as a span with a name.
	$name = DOMCompat::getAttribute( $node, 'name' );
	if ( $name === null ) {
		$chunk = $str;
	} else {
		$doc = $node->ownerDocument;
		$span = $doc->createElement( 'span' );
		$span->setAttribute( 'name', $name );
		$span->appendChild( $doc->createTextNode( $str ) );
		$chunk = DOMCompat::getOuterHTML( $span );
	}
	$state->emitChunk( $chunk, $node );
}
1044 | |
/**
 * Entry point for serializing a figure (media) element.
 *
 * With a media structure, the figure is converted via
 * figureToConstrainedText() and the result (or an empty string when that
 * returns null) is emitted for $node. Without one, an error is logged and
 * the node is handed to the fallback HTML handler instead.
 *
 * @param SerializerState $state
 * @param Element $node
 * @param ?MediaStructure $ms Media structure for $node, or null
 */
public static function figureHandler(
	SerializerState $state, Element $node, ?MediaStructure $ms
): void {
	if ( $ms !== null ) {
		$constrained = self::figureToConstrainedText( $state, $ms );
		$state->emitChunk( $constrained ?? '', $node );
		return;
	}

	// No media structure: report it and serialize the node as plain HTML.
	$state->getEnv()->log(
		'error/html2wt/figure',
		"Couldn't parse media structure: ",
		DOMCompat::getOuterHTML( $node )
	);
	$fallback = new FallbackHTMLHandler();
	$fallback->handle( $node, $state );
}
1067 | |
1068 | /** |
1069 | * Serialize a figure to constrained text. |
1070 | * |
1071 | * WARN: There's probably more to do to ensure this is purely functional, |
1072 | * no side-effects (ie. calls to state->emit) happen while processing. |
1073 | * |
1074 | * @param SerializerState $state |
1075 | * @param MediaStructure $ms |
1076 | * @return ?ConstrainedText |
1077 | */ |
1078 | public static function figureToConstrainedText( |
1079 | SerializerState $state, MediaStructure $ms |
1080 | ): ?ConstrainedText { |
1081 | $env = $state->getEnv(); |
1082 | $outerElt = $ms->containerElt ?? $ms->mediaElt; |
1083 | $linkElt = $ms->linkElt; |
1084 | $elt = $ms->mediaElt; |
1085 | $captionElt = $ms->captionElt; |
1086 | $format = WTUtils::getMediaFormat( $outerElt ); |
1087 | |
1088 | // Try to identify the local title to use for this image. |
1089 | $resource = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'resource' ); |
1090 | if ( !isset( $resource['value'] ) ) { |
1091 | // from non-parsoid HTML: try to reconstruct resource from src? |
1092 | // (this won't work for manual-thumb images) |
1093 | $src = DOMCompat::getAttribute( $elt, 'src' ); |
1094 | if ( $src === null ) { |
1095 | $env->log( 'error/html2wt/figure', |
1096 | 'In WSP.figureHandler, img does not have resource or src:', |
1097 | DOMCompat::getOuterHTML( $outerElt ) |
1098 | ); |
1099 | return null; |
1100 | } |
1101 | if ( preg_match( '/^https?:/', $src ) ) { |
1102 | // external image link, presumably $wgAllowExternalImages=true |
1103 | return new AutoURLLinkText( $src, $outerElt ); |
1104 | } |
1105 | $resource = [ |
1106 | 'value' => $src, |
1107 | 'fromsrc' => false, |
1108 | 'modified' => false |
1109 | ]; |
1110 | } |
1111 | if ( empty( $resource['fromsrc'] ) ) { |
1112 | $resource['value'] = preg_replace( '#^(\.\.?/)+#', '', $resource['value'], 1 ); |
1113 | } |
1114 | |
1115 | $nopts = []; |
1116 | $outerDP = DOMDataUtils::getDataParsoid( $outerElt ); |
1117 | $outerDMW = DOMDataUtils::getDataMw( $outerElt ); |
1118 | $mwAliases = $state->getEnv()->getSiteConfig()->mwAliases(); |
1119 | |
1120 | // Return ref to the array element in case it is modified |
1121 | $getOpt = static function & ( $key ) use ( &$outerDP ): ?array { |
1122 | $null = null; |
1123 | if ( empty( $outerDP->optList ) ) { |
1124 | return $null; |
1125 | } |
1126 | foreach ( $outerDP->optList as $opt ) { |
1127 | if ( ( $opt['ck'] ?? null ) === $key ) { |
1128 | return $opt; |
1129 | } |
1130 | } |
1131 | return $null; |
1132 | }; |
1133 | // Return ref to the array element in case it is modified |
1134 | $getLastOpt = static function & ( $key ) use ( &$outerDP ): ?array { |
1135 | $null = null; |
1136 | $opts = $outerDP->optList ?? []; |
1137 | for ( $i = count( $opts ) - 1; $i >= 0; $i-- ) { |
1138 | if ( ( $opts[$i]['ck'] ?? null ) === $key ) { |
1139 | return $opts[$i]; |
1140 | } |
1141 | } |
1142 | return $null; |
1143 | }; |
1144 | |
1145 | // Try to identify the local title to use for the link. |
1146 | $link = null; |
1147 | |
1148 | $linkFromDataMw = WTSUtils::getAttrFromDataMw( $outerDMW, 'link', true ); |
1149 | if ( $linkFromDataMw !== null ) { |
1150 | // "link" attribute on the `outerElt` takes precedence |
1151 | if ( isset( $linkFromDataMw->value['html'] ) ) { |
1152 | $link = $state->serializer->getAttributeValueAsShadowInfo( $outerElt, 'link' ); |
1153 | } else { |
1154 | $link = [ |
1155 | 'value' => "link={$linkFromDataMw->value['txt']}", |
1156 | 'modified' => false, |
1157 | 'fromsrc' => false, |
1158 | 'fromDataMW' => true |
1159 | ]; |
1160 | } |
1161 | } elseif ( $linkElt && $linkElt->hasAttribute( 'href' ) ) { |
1162 | $link = $state->serializer->serializedImageAttrVal( $outerElt, $linkElt, 'href' ); |
1163 | if ( empty( $link['fromsrc'] ) ) { |
1164 | // strip page or lang parameter if present on href |
1165 | $strippedHref = preg_replace( |
1166 | '#[?]((?:page=\d+)|(?:lang=[a-z]+(?:-[a-z]+)*))$#Di', |
1167 | '', |
1168 | DOMCompat::getAttribute( $linkElt, 'href' ) ?? '' |
1169 | ); |
1170 | if ( $strippedHref === DOMCompat::getAttribute( $elt, 'resource' ) ) { |
1171 | // default link: same place as resource |
1172 | $link = $resource; |
1173 | } |
1174 | $link['value'] = preg_replace( '#^(\.\.?/)+#', '', $link['value'], 1 ); |
1175 | } |
1176 | } else { |
1177 | // Otherwise, just try and get it from data-mw |
1178 | $link = $state->serializer->getAttributeValueAsShadowInfo( $outerElt, 'href' ); |
1179 | } |
1180 | |
1181 | if ( $link && empty( $link['modified'] ) && empty( $link['fromsrc'] ) ) { |
1182 | $linkOpt = $getOpt( 'link' ); |
1183 | if ( $linkOpt ) { |
1184 | $link['fromsrc'] = true; |
1185 | $link['value'] = $linkOpt['ak']; |
1186 | } |
1187 | } |
1188 | |
1189 | // Reconstruct the caption |
1190 | if ( !$captionElt && ( $outerDMW->caption ?? null ) !== null ) { |
1191 | $fragment = $outerDMW->caption; |
1192 | // FIXME: We should just be able to serialize the children of the |
1193 | // fragment, however, we need some way of marking this as being |
1194 | // inInsertedContent so that any bare text is assured to be escaped |
1195 | $captionElt = $outerElt->ownerDocument->createElement( 'div' ); |
1196 | DOMDataUtils::getDataParsoid( $captionElt )->setTempFlag( TempData::IS_NEW ); |
1197 | DOMUtils::migrateChildren( $fragment, $captionElt ); |
1198 | // Needs a parent node in order for WTS to be happy |
1199 | $fragment->appendChild( $captionElt ); |
1200 | } |
1201 | |
1202 | $caption = null; |
1203 | if ( $captionElt ) { |
1204 | $caption = $state->serializeCaptionChildrenToString( |
1205 | $captionElt, [ $state->serializer->wteHandlers, 'mediaOptionHandler' ] |
1206 | ); |
1207 | |
1208 | // Alt stuff |
1209 | if ( !WTUtils::hasVisibleCaption( $outerElt ) && $elt->hasAttribute( 'alt' ) ) { |
1210 | $altOnElt = trim( DOMCompat::getAttribute( $elt, 'alt' ) ?? '' ); |
1211 | $altFromCaption = trim( WTUtils::textContentFromCaption( $captionElt ) ); |
1212 | // The first condition is to support an empty \alt=\ option |
1213 | // when no caption is present |
1214 | if ( $altOnElt && ( $altOnElt === $altFromCaption ) ) { |
1215 | $elt->removeAttribute( 'alt' ); |
1216 | } |
1217 | } |
1218 | } |
1219 | |
1220 | // Fetch the alt (if any) |
1221 | $alt = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'alt' ); |
1222 | // Fetch the lang (if any) |
1223 | $lang = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'lang' ); |
1224 | // Fetch the muted (if any) |
1225 | $muted = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'muted' ); |
1226 | // Fetch the loop (if any) |
1227 | $loop = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'loop' ); |
1228 | |
1229 | // Ok, start assembling options, beginning with link & alt & lang |
1230 | // Other media don't have links in output. |
1231 | $linkCond = DOMCompat::nodeName( $elt ) === 'img'; |
1232 | if ( $linkCond && $link ) { |
1233 | // Check whether the link goes to the default place, in which |
1234 | // case an explicit link tag isn't needed. |
1235 | // The link may be external, or may include wikitext template markup, |
1236 | // therefore check first that it parses to a title. |
1237 | $linkTitle = $env->normalizedTitleKey( |
1238 | Utils::decodeURIComponent( $link['value'] ), true |
1239 | ); |
1240 | $resourceTitle = $env->normalizedTitleKey( |
1241 | Utils::decodeURIComponent( $resource['value'] ), true |
1242 | ); |
1243 | if ( |
1244 | $link['value'] === $resource['value'] || |
1245 | ( $linkTitle !== null && $linkTitle === $resourceTitle ) |
1246 | ) { |
1247 | $linkCond = false; // No explicit link attribute needed |
1248 | } |
1249 | } |
1250 | |
1251 | // "alt" for non-image is handled below |
1252 | $altCond = $alt['value'] !== null && DOMCompat::nodeName( $elt ) === 'img'; |
1253 | |
1254 | // This loop handles media options which *mostly* correspond 1-1 with |
1255 | // HTML attributes. `img_$name` is the name of the media option, |
1256 | // and $value is the Parsoid "shadow info" for the attribute. |
1257 | // $cond tells us whether we need to explicitly output this option; |
1258 | // if it is false we are using an implicit default. |
1259 | // `lang` and `alt` are fairly straightforward. `link` |
1260 | // is a little trickier, since we need to massage/fake the shadow |
1261 | // info because it doesn't come *directly* from the attribute. |
1262 | // link comes from the combination of a[href], img[src], and |
1263 | // img[resource], etc; |
1264 | foreach ( [ |
1265 | [ 'name' => 'link', 'value' => $link, 'cond' => $linkCond, 'alias' => 'img_link' ], |
1266 | [ 'name' => 'alt', 'value' => $alt, 'cond' => $altCond, 'alias' => 'img_alt' ], |
1267 | [ 'name' => 'lang', 'value' => $lang, 'cond' => isset( $lang['value'] ), 'alias' => 'img_lang' ], |
1268 | [ 'name' => 'muted', 'value' => $muted, 'cond' => isset( $muted['value'] ), 'alias' => 'timedmedia_muted' ], |
1269 | [ 'name' => 'loop', 'value' => $loop, 'cond' => isset( $loop['value'] ), 'alias' => 'timedmedia_loop' ], |
1270 | ] as $o ) { |
1271 | if ( !$o['cond'] ) { |
1272 | continue; |
1273 | } |
1274 | if ( $o['value'] && !empty( $o['value']['fromsrc'] ) ) { |
1275 | $nopts[] = [ |
1276 | 'ck' => $o['name'], |
1277 | 'ak' => [ $o['value']['value'] ], |
1278 | ]; |
1279 | } else { |
1280 | $value = $o['value'] ? $o['value']['value'] : ''; |
1281 | if ( $o['value'] && in_array( $o['name'], [ 'link', 'alt' ], true ) ) { |
1282 | // see WikiLinkHandler::isWikitextOpt(): link and alt are allowed |
1283 | // to contain arbitrary wikitext, even though it is stripped |
1284 | // to a string before emitting. |
1285 | $value = $state->serializer->wteHandlers->escapeLinkContent( |
1286 | $state, $value, false, $outerElt, true |
1287 | ); |
1288 | } |
1289 | $nopts[] = [ |
1290 | 'ck' => $o['name'], |
1291 | 'v' => $value, |
1292 | 'ak' => $mwAliases[$o['alias']], |
1293 | ]; |
1294 | } |
1295 | } |
1296 | |
1297 | // Now we handle media options which all come from space-separated |
1298 | // values in a single HTML attribute, `class`. (But note that there |
1299 | // can also be "extra" classes added by `img_class` as well.) |
1300 | $classes = DOMCompat::getClassList( $outerElt ); |
1301 | $extra = []; // 'extra' classes |
1302 | $val = null; |
1303 | |
1304 | foreach ( $classes as $c ) { |
1305 | switch ( $c ) { |
1306 | case 'mw-halign-none': |
1307 | case 'mw-halign-right': |
1308 | case 'mw-halign-left': |
1309 | case 'mw-halign-center': |
1310 | $val = substr( $c, 10 ); // strip mw-halign- prefix |
1311 | $nopts[] = [ |
1312 | 'ck' => $val, |
1313 | 'ak' => $mwAliases['img_' . $val], |
1314 | ]; |
1315 | break; |
1316 | |
1317 | case 'mw-valign-top': |
1318 | case 'mw-valign-middle': |
1319 | case 'mw-valign-baseline': |
1320 | case 'mw-valign-sub': |
1321 | case 'mw-valign-super': |
1322 | case 'mw-valign-text-top': |
1323 | case 'mw-valign-bottom': |
1324 | case 'mw-valign-text-bottom': |
1325 | $val = strtr( substr( $c, 10 ), '-', '_' ); // strip mw-valign and '-' to '_' |
1326 | $nopts[] = [ |
1327 | 'ck' => $val, |
1328 | 'ak' => $mwAliases['img_' . $val], |
1329 | ]; |
1330 | break; |
1331 | |
1332 | case 'mw-image-border': |
1333 | $nopts[] = [ |
1334 | 'ck' => 'border', |
1335 | 'ak' => $mwAliases['img_border'], |
1336 | ]; |
1337 | break; |
1338 | |
1339 | case 'mw-default-size': |
1340 | case 'mw-default-audio-height': |
1341 | // handled below |
1342 | break; |
1343 | |
1344 | default: |
1345 | $extra[] = $c; |
1346 | break; |
1347 | } |
1348 | } |
1349 | |
1350 | if ( count( $extra ) ) { |
1351 | $nopts[] = [ |
1352 | 'ck' => 'class', |
1353 | 'v' => implode( ' ', $extra ), |
1354 | 'ak' => $mwAliases['img_class'], |
1355 | ]; |
1356 | } |
1357 | |
1358 | // Now we handle parameters which don't have a representation |
1359 | // as HTML attributes; they are set only from the data-mw |
1360 | // values. (In theory they could perhaps be reverse engineered |
1361 | // from the thumbnail URL, but that would be fragile and expose |
1362 | // thumbnail implementation to the editor so we don't do that.) |
1363 | $mwParams = [ |
1364 | [ 'prop' => 'thumb', 'ck' => 'manualthumb', 'alias' => 'img_manualthumb' ], |
1365 | [ 'prop' => 'page', 'ck' => 'page', 'alias' => 'img_page' ], |
1366 | // Video specific |
1367 | [ 'prop' => 'starttime', 'ck' => 'starttime', 'alias' => 'timedmedia_starttime' ], |
1368 | [ 'prop' => 'endtime', 'ck' => 'endtime', 'alias' => 'timedmedia_endtime' ], |
1369 | [ 'prop' => 'thumbtime', 'ck' => 'thumbtime', 'alias' => 'timedmedia_thumbtime' ] |
1370 | ]; |
1371 | |
1372 | // `img_link` and `img_alt` are only surfaced as HTML attributes |
1373 | // for image media. For all other media we treat them as set only |
1374 | // from data-mw. |
1375 | if ( DOMCompat::nodeName( $elt ) !== 'img' ) { |
1376 | $mwParams[] = [ 'prop' => 'link', 'ck' => 'link', 'alias' => 'img_link' ]; |
1377 | $mwParams[] = [ 'prop' => 'alt', 'ck' => 'alt', 'alias' => 'img_alt' ]; |
1378 | } |
1379 | |
1380 | $hasManualthumb = false; |
1381 | foreach ( $mwParams as $o ) { |
1382 | $v = $outerDMW->{$o['prop']} ?? null; |
1383 | if ( $v === null ) { |
1384 | $a = WTSUtils::getAttrFromDataMw( $outerDMW, $o['ck'], true ); |
1385 | if ( $a !== null ) { |
1386 | if ( isset( $a->value['html'] ) ) { |
1387 | $si = $state->serializer->getAttributeValueAsShadowInfo( $outerElt, $o['ck'] ); |
1388 | if ( isset( $si['value'] ) ) { |
1389 | $nopts[] = [ |
1390 | 'ck' => $o['ck'], |
1391 | 'ak' => [ $si['value'] ], |
1392 | ]; |
1393 | continue; |
1394 | } |
1395 | } else { |
1396 | $v = $a->value['txt']; |
1397 | } |
1398 | } |
1399 | } |
1400 | if ( $v !== null ) { |
1401 | $ak = $state->serializer->getAttributeValue( |
1402 | $outerElt, $o['ck'] |
1403 | ) ?? $mwAliases[$o['alias']]; |
1404 | $nopts[] = [ |
1405 | 'ck' => $o['ck'], |
1406 | 'ak' => $ak, |
1407 | 'v' => $v |
1408 | ]; |
1409 | // Piggyback this here ... |
1410 | if ( $o['prop'] === 'thumb' ) { |
1411 | $hasManualthumb = true; |
1412 | $format = ''; |
1413 | } |
1414 | } |
1415 | } |
1416 | |
1417 | // These media options come from the HTML `typeof` attribute. |
1418 | switch ( $format ) { |
1419 | case 'Thumb': |
1420 | $nopts[] = [ |
1421 | 'ck' => 'thumbnail', |
1422 | 'ak' => $state->serializer->getAttributeValue( |
1423 | $outerElt, 'thumbnail' |
1424 | ) ?? $mwAliases['img_thumbnail'], |
1425 | ]; |
1426 | break; |
1427 | case 'Frame': |
1428 | $nopts[] = [ |
1429 | 'ck' => 'framed', |
1430 | 'ak' => $state->serializer->getAttributeValue( |
1431 | $outerElt, 'framed' |
1432 | ) ?? $mwAliases['img_framed'], |
1433 | ]; |
1434 | break; |
1435 | case 'Frameless': |
1436 | $nopts[] = [ |
1437 | 'ck' => 'frameless', |
1438 | 'ak' => $state->serializer->getAttributeValue( |
1439 | $outerElt, 'frameless' |
1440 | ) ?? $mwAliases['img_frameless'], |
1441 | ]; |
1442 | break; |
1443 | } |
1444 | |
1445 | // Now handle the size-related options. This is complicated! |
1446 | // We consider the `height`, `data-height`, `width`, and |
1447 | // `data-width` attributes, as well as the `typeof` and the `class`. |
1448 | |
1449 | // Get the user-specified height from wikitext |
1450 | $wh = $state->serializer->serializedImageAttrVal( |
1451 | $outerElt, $elt, $ms->isRedLink() ? 'data-height' : 'height' |
1452 | ); |
1453 | // Get the user-specified width from wikitext |
1454 | $ww = $state->serializer->serializedImageAttrVal( |
1455 | $outerElt, $elt, $ms->isRedLink() ? 'data-width' : 'width' |
1456 | ); |
1457 | |
1458 | $sizeUnmodified = !empty( $ww['fromDataMW'] ) || |
1459 | ( empty( $ww['modified'] ) && empty( $wh['modified'] ) ); |
1460 | $upright = $getOpt( 'upright' ); |
1461 | |
1462 | // XXX: Infer upright factor from default size for all thumbs by default? |
1463 | // Better for scaling with user prefs, but requires knowledge about |
1464 | // default used in VE. |
1465 | if ( $sizeUnmodified && $upright && |
1466 | // Only serialize upright where it is actually respected |
1467 | // This causes some dirty diffs, but makes sure that we don't |
1468 | // produce nonsensical output after a type switch. |
1469 | // TODO: Only strip if type was actually modified. |
1470 | in_array( $format, [ 'Frameless', 'Thumb' ], true ) |
1471 | ) { |
1472 | // preserve upright option |
1473 | $nopts[] = [ |
1474 | 'ck' => $upright['ck'], |
1475 | 'ak' => [ $upright['ak'] ], // FIXME: don't use ak here! |
1476 | ]; |
1477 | } |
1478 | |
1479 | if ( |
1480 | !DOMUtils::hasClass( $outerElt, 'mw-default-size' ) && |
1481 | $format !== 'Frame' && !$hasManualthumb |
1482 | ) { |
1483 | $size = $getLastOpt( 'width' ); |
1484 | $sizeString = (string)( $size['ak'] ?? '' ); |
1485 | if ( $sizeString === '' && !empty( $ww['fromDataMW'] ) ) { |
1486 | $sizeString = (string)( $ww['value'] ?? '' ); |
1487 | } |
1488 | if ( $sizeUnmodified && $sizeString !== '' ) { |
1489 | // preserve original width/height string if not touched |
1490 | $nopts[] = [ |
1491 | 'ck' => 'width', |
1492 | 'v' => $sizeString, // original size string |
1493 | 'ak' => [ '$1' ], // don't add px or the like |
1494 | ]; |
1495 | } else { |
1496 | $bbox = null; |
1497 | // Serialize to a square bounding box |
1498 | if ( isset( $ww['value'] ) && preg_match( '/^\d+/', $ww['value'] ) ) { |
1499 | $bbox = intval( $ww['value'] ); |
1500 | } |
1501 | if ( isset( $wh['value'] ) && preg_match( '/^\d+/', $wh['value'] ) && |
1502 | // As with "mw-default-size", editing clients should remove the |
1503 | // "mw-default-audio-height" if they want to factor a defined |
1504 | // height into the bounding box size. However, note that, at |
1505 | // present, a defined height for audio is ignored while parsing, |
1506 | // so this only has the effect of modifying the width. |
1507 | ( |
1508 | DOMCompat::nodeName( $elt ) !== 'audio' || |
1509 | !DOMUtils::hasClass( $outerElt, 'mw-default-audio-height' ) |
1510 | ) |
1511 | ) { |
1512 | $height = intval( $wh['value'] ); |
1513 | if ( $bbox === null || $height > $bbox ) { |
1514 | $bbox = $height; |
1515 | } |
1516 | } |
1517 | if ( $bbox !== null ) { |
1518 | $nopts[] = [ |
1519 | 'ck' => 'width', |
1520 | // MediaWiki interprets 100px as a width |
1521 | // restriction only, so we need to make the bounding |
1522 | // box explicitly square (100x100px). The 'px' is |
1523 | // added by the alias though, and can be localized. |
1524 | 'v' => $bbox . 'x' . $bbox, |
1525 | 'ak' => $mwAliases['img_width'], // adds the 'px' suffix |
1526 | ]; |
1527 | } |
1528 | } |
1529 | } |
1530 | |
1531 | $opts = $outerDP->optList ?? []; // original wikitext options |
1532 | |
1533 | // Add bogus options from old optlist in order to round-trip cleanly (T64500) |
1534 | foreach ( $opts as $o ) { |
1535 | if ( ( $o['ck'] ?? null ) === 'bogus' ) { |
1536 | $nopts[] = [ |
1537 | 'ck' => 'bogus', |
1538 | 'ak' => [ $o['ak'] ], |
1539 | ]; |
1540 | } |
1541 | } |
1542 | |
1543 | // Put the caption last, by default. |
1544 | if ( is_string( $caption ) ) { |
1545 | $nopts[] = [ |
1546 | 'ck' => 'caption', |
1547 | 'ak' => [ $caption ], |
1548 | ]; |
1549 | } |
1550 | |
1551 | // ok, sort the new options to match the order given in the old optlist |
1552 | // and try to match up the aliases used |
1553 | $changed = false; |
1554 | foreach ( $nopts as &$no ) { |
1555 | // Make sure we have an array here. Default in data-parsoid is |
1556 | // actually a string. |
1557 | // FIXME: don't reuse ak for two different things! |
1558 | if ( !is_array( $no['ak'] ) ) { |
1559 | $no['ak'] = [ $no['ak'] ]; |
1560 | } |
1561 | |
1562 | $no['sortId'] = count( $opts ); |
1563 | $idx = -1; |
1564 | foreach ( $opts as $i => $o ) { |
1565 | if ( ( $o['ck'] ?? null ) === $no['ck'] && |
1566 | // for bogus options, make sure the source matches too. |
1567 | ( $o['ck'] !== 'bogus' || $o['ak'] === $no['ak'][0] ) |
1568 | ) { |
1569 | $idx = $i; |
1570 | break; |
1571 | } |
1572 | } |
1573 | if ( $idx < 0 ) { |
1574 | // Preferred words are first in the alias list |
1575 | // (but not in old versions of MediaWiki). |
1576 | $no['ak'] = $no['ak'][0]; |
1577 | $changed = true; |
1578 | continue; |
1579 | } |
1580 | |
1581 | $no['sortId'] = $idx; |
1582 | // use a matching alias, if there is one |
1583 | $a = null; |
1584 | foreach ( $no['ak'] as $b ) { |
1585 | // note the trim() in the comparison below; it lets an alias match (and |
1586 | // then be reused verbatim) despite eccentric whitespace in the original option wikitext |
1587 | $b2 = $b; |
1588 | if ( isset( $no['v'] ) ) { |
1589 | $b2 = str_replace( '$1', $no['v'], $b ); |
1590 | } |
1591 | if ( $b2 === trim( implode( ',', (array)$opts[$idx]['ak'] ) ) ) { |
1592 | $a = $b; |
1593 | break; |
1594 | } |
1595 | } |
1596 | // use the alias (incl whitespace) from the original option wikitext |
1597 | // if found; otherwise use the last alias given (English default by |
1598 | // convention that works everywhere). |
1599 | // TODO: use first alias (localized) instead for RTL languages (T53852) |
1600 | if ( $a !== null && $no['ck'] !== 'caption' ) { |
1601 | $no['ak'] = $opts[$idx]['ak']; |
1602 | unset( $no['v'] ); // prevent double substitution |
1603 | } else { |
1604 | $no['ak'] = PHPUtils::lastItem( $no['ak'] ); |
1605 | if ( !( $no['ck'] === 'caption' && $a !== null ) ) { |
1606 | $changed = true; |
1607 | } |
1608 | } |
1609 | } |
1610 | |
1611 | // Filter out bogus options if the image options/caption have changed. |
1612 | if ( $changed ) { |
1613 | $nopts = array_filter( $nopts, static function ( $no ) { |
1614 | return $no['ck'] !== 'bogus'; |
1615 | } ); |
1616 | // empty captions should get filtered out in this case, too (T64264) |
1617 | $nopts = array_filter( $nopts, static function ( $no ) { |
1618 | return !( $no['ck'] === 'caption' && $no['ak'] === '' ); |
1619 | } ); |
1620 | } |
1621 | |
1622 | // sort! |
1623 | usort( $nopts, static function ( $a, $b ) { |
1624 | return $a['sortId'] <=> $b['sortId']; |
1625 | } ); |
1626 | |
1627 | // emit all the options as wikitext! |
1628 | $wikitext = '[[' . $resource['value']; |
1629 | foreach ( $nopts as $o ) { |
1630 | $wikitext .= '|'; |
1631 | if ( isset( $o['v'] ) ) { |
1632 | $wikitext .= str_replace( '$1', $o['v'], $o['ak'] ); |
1633 | } else { |
1634 | $wikitext .= $o['ak']; |
1635 | } |
1636 | } |
1637 | $wikitext .= ']]'; |
1638 | |
1639 | return new WikiLinkText( |
1640 | $wikitext, $outerElt, $state->getEnv()->getSiteConfig(), 'mw:File' |
1641 | ); |
1642 | } |
1643 | |
1644 | } |