Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
13.73% |
35 / 255 |
|
22.22% |
6 / 27 |
CRAP | |
0.00% |
0 / 1 |
Utils | |
13.73% |
35 / 255 |
|
22.22% |
6 / 27 |
3984.94 | |
0.00% |
0 / 1 |
stripParsoidIdPrefix | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
stripNamespace | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
isParsoidObjectId | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
isVoidElement | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
cloneArray | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
12 | |||
clone | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
56 | |||
lastUniChar | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
30 | |||
isUniWord | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
phpURLEncode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
decodeURI | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
decodeURIComponent | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
3 | |||
extractExtBody | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
isValidOffset | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
isValidDSR | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
42 | |||
normalizeNamespaceName | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
decodeWtEntities | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
escapeWtEntities | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
2 | |||
escapeWt | |
0.00% |
0 / 53 |
|
0.00% |
0 / 1 |
42 | |||
escapeHtml | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
entityEncodeAll | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
1 | |||
isProtocolValid | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
getExtArgInfo | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
12 | |||
parseMediaDimensions | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
90 | |||
validateMediaParam | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
bcp47ToMwCode | |
0.00% |
0 / 26 |
|
0.00% |
0 / 1 |
6 | |||
mwCodeToBcp47 | |
0.00% |
0 / 51 |
|
0.00% |
0 / 1 |
182 | |||
isBcp47CodeEqual | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Utils; |
5 | |
6 | use Psr\Log\LoggerInterface; |
7 | use Wikimedia\Bcp47Code\Bcp47Code; |
8 | use Wikimedia\Bcp47Code\Bcp47CodeValue; |
9 | use Wikimedia\Parsoid\Config\Env; |
10 | use Wikimedia\Parsoid\Config\SiteConfig; |
11 | use Wikimedia\Parsoid\Core\DomSourceRange; |
12 | use Wikimedia\Parsoid\Core\Sanitizer; |
13 | use Wikimedia\Parsoid\NodeData\DataMw; |
14 | use Wikimedia\Parsoid\NodeData\DataMwBody; |
15 | use Wikimedia\Parsoid\Tokens\Token; |
16 | use Wikimedia\Parsoid\Wikitext\Consts; |
17 | |
18 | /** |
19 | * This file contains general utilities for token transforms. |
20 | */ |
21 | class Utils { |
/**
 * Regular expression fragment for matching wikitext comments.
 * Meant for inclusion in other regular expressions.
 */
// Maintenance note: this is used in /x regexes so all whitespace and # should be escaped
public const COMMENT_REGEXP_FRAGMENT = '<!--(?>[\s\S]*?-->)';
/** Regular expression for matching a wikitext comment */
public const COMMENT_REGEXP = '/' . self::COMMENT_REGEXP_FRAGMENT . '/';

/**
 * Regular expression matching a string that consists solely of
 * whitespace and/or wikitext comments (the empty string included).
 */
public const COMMENT_OR_WS_REGEXP = '/^(\s|' . self::COMMENT_REGEXP_FRAGMENT . ')*$/D';
32 | |
/**
 * Remove the Parsoid id prefix ("mwt", optionally preceded by "#")
 * from the start of an about id.
 *
 * @param string $aboutId about ID string
 * @return string the id with any leading prefix removed
 */
public static function stripParsoidIdPrefix( string $aboutId ): string {
	// Newly minted ids carry the 'mwt' prefix
	$prefixPattern = '/^#?mwt/';
	$stripped = preg_replace( $prefixPattern, '', $aboutId );
	return $stripped;
}
43 | |
/**
 * Reduce a fully qualified class name to its unqualified form by
 * dropping everything up to and including the last backslash.
 *
 * @param string $className
 * @return string
 */
public static function stripNamespace( string $className ): string {
	$unqualified = preg_replace( '/.*\\\\/', '', $className );
	return $unqualified;
}
52 | |
/**
 * Tell whether an about id begins with the Parsoid id prefix "#mwt".
 *
 * @param string $aboutId about ID string
 * @return bool
 */
public static function isParsoidObjectId( string $aboutId ): bool {
	// 'mwt' is the prefix used for new ids; compare just the
	// first four bytes against '#mwt'.
	return strncmp( $aboutId, '#mwt', 4 ) === 0;
}
63 | |
/**
 * Report whether the named tag is a void element (one that cannot
 * have content), i.e. whether it is a key of Consts::$HTML['VoidTags'].
 *
 * @param string $name tag name
 * @return bool
 */
public static function isVoidElement( string $name ): bool {
	$isVoid = isset( Consts::$HTML['VoidTags'][$name] );
	return $isVoid;
}
73 | |
/**
 * Copy an array, recursing into nested arrays and applying PHP's
 * native `clone` to each object value. Keys are preserved and all
 * other values are copied as-is.
 *
 * @param array $arr
 * @return array
 */
public static function cloneArray( array $arr ): array {
	$copy = [];
	foreach ( $arr as $key => $item ) {
		if ( is_array( $item ) ) {
			$copy[$key] = self::cloneArray( $item );
		} elseif ( is_object( $item ) ) {
			$copy[$key] = clone $item;
		} else {
			$copy[$key] = $item;
		}
	}
	return $copy;
}
88 | |
/**
 * Clone a value; deep by default.
 *
 * @param object|array $obj arrays or plain objects.
 *  Tokens or DOM nodes shouldn't be passed in.
 *
 *  CAVEAT: debugging code appears to pass in arrays that can hold
 *  DOM nodes. In debug mode a top-level DOM node, or DOM nodes that
 *  are elements of an array, are therefore handled via cloneNode().
 *  An object that embeds a DOM node is still not supported.
 *
 * @param bool $deepClone
 * @param bool $debug
 * @return object|array
 * @deprecated Use native PHP cloning and Utils::cloneArray when needed
 */
public static function clone( $obj, $deepClone = true, $debug = false ) {
	if ( $debug && $obj instanceof \DOMNode ) {
		return $obj->cloneNode( $deepClone );
	}

	if ( $debug && is_array( $obj ) ) {
		if ( !$deepClone ) {
			return $obj; // Copy-on-write cloning
		}
		$copies = [];
		foreach ( $obj as $key => $member ) {
			// @phan-suppress-next-line PhanDeprecatedFunction
			$copies[$key] = self::clone( $member, true, true );
		}
		return $copies;
	}

	if ( is_object( $obj ) && !$deepClone ) {
		return clone $obj;
	}

	// FIXME, see T161647
	// The serialize round-trip fails if $obj is (or embeds) a DOMNode
	$serialized = serialize( $obj );
	return unserialize( $serialized );
}
132 | |
/**
 * Extract the last *unicode* character of the string.
 * This might be more than one byte, if the last character
 * is non-ASCII.
 *
 * The scan walks backwards over UTF-8 continuation bytes (10xxxxxx)
 * until it reaches a lead byte or the start of the string, so
 * malformed input that begins with continuation bytes yields those
 * bytes rather than running past the start of the string.
 *
 * @param string $str
 * @param ?int $idx The index *after* the character to extract; defaults
 *  to the length of $str, which will extract the last character in
 *  $str.
 * @return string The extracted character, or '' if $str is empty or
 *  $idx lies outside 1..strlen( $str ).
 */
public static function lastUniChar( string $str, ?int $idx = null ): string {
	$len = strlen( $str );
	if ( $idx === null ) {
		$idx = $len;
	}
	// Also covers the empty string, where the defaulted index is 0
	if ( $idx <= 0 || $idx > $len ) {
		return '';
	}
	$c = $str[--$idx];
	// ord() inspects the first byte of $c, i.e. the byte just prepended.
	// The $idx > 0 bound stops the scan at the start of the string;
	// without it negative offsets would be read and the loop would
	// never end on input that starts with a continuation byte.
	while ( $idx > 0 && ( ord( $c ) & 0xC0 ) === 0x80 ) {
		$c = $str[--$idx] . $c;
	}
	return $c;
}
155 | |
/**
 * Test whether $s starts with a unicode word character (PCRE `\w`
 * under the `u` modifier).
 *
 * @param string $s
 * @return bool
 */
public static function isUniWord( string $s ): bool {
	$matched = preg_match( '#^\w#u', $s );
	return $matched === 1;
}
164 | |
/**
 * This should not be used: it unconditionally throws, pointing
 * callers at PHP's urlencode() instead.
 *
 * @param string $txt URL to encode using PHP encoding
 * @return string
 * @throws \BadMethodCallException always
 */
public static function phpURLEncode( $txt ) {
	// @phan-suppress-previous-line PhanPluginNeverReturnMethod
	$message = 'Use urlencode( $txt ) instead';
	throw new \BadMethodCallException( $message );
}
174 | |
/**
 * Percent-decode only valid UTF-8 characters, leaving other encoded bytes alone.
 *
 * Unlike `decodeURIComponent`, the escapes %23 %24 %26 %2B %2C %2F
 * %3A %3B %3D %3F and %40 are left encoded, matching the behavior of
 * JavaScript's decodeURI().
 *
 * @see https://www.ecma-international.org/ecma-262/6.0/#sec-decodeuri-encodeduri
 * @param string $s URI to be decoded
 * @return string
 */
public static function decodeURI( string $s ): string {
	// Turn the '%' that introduces a reserved-character escape into
	// '%25', so that the component decoder restores it to a literal
	// '%' and the escape itself survives.
	$reservedEscape = '/%(?=2[346bcfBCF]|3[abdfABDF]|40)/';
	$protected = preg_replace( $reservedEscape, '%25', $s );
	return self::decodeURIComponent( $protected );
}
190 | |
/**
 * Percent-decode only valid UTF-8 characters, leaving other encoded bytes alone.
 *
 * @param string $s URI to be decoded
 * @return string
 */
public static function decodeURIComponent( string $s ): string {
	// Fast path: decode everything at once and keep the result if it
	// is well-formed UTF-8, which is the common case.
	$decoded = rawurldecode( $s );
	if ( !mb_check_encoding( $decoded, 'UTF-8' ) ) {
		// Slow path: pick out each escape run shaped like a 1- to 4-byte
		// UTF-8 sequence and decode it only if the result is valid.
		$decodeIfValid = static function ( $match ) {
			$char = rawurldecode( $match[0] );
			return mb_check_encoding( $char, 'UTF-8' ) ? $char : $match[0];
		};
		$decoded = preg_replace_callback(
			// phpcs:ignore Generic.Files.LineLength.TooLong
			'/%[0-7][0-9A-F]|%[CD][0-9A-F]%[89AB][0-9A-F]|%E[0-9A-F](?:%[89AB][0-9A-F]){2}|%F[0-4](?:%[89AB][0-9A-F]){3}/i',
			$decodeIfValid,
			$s
		);
	}
	return $decoded;
}
214 | |
/**
 * Extract the extension body from an extension tag token: the token's
 * 'source' attribute is passed through stripTags() on the token's
 * extTagOffsets range.
 *
 * @param Token $token token
 * @return string
 */
public static function extractExtBody( Token $token ): string {
	$source = $token->getAttributeV( 'source' );
	$extTagOffsets = $token->dataParsoid->extTagOffsets;
	'@phan-var \Wikimedia\Parsoid\Core\DomSourceRange $extTagOffsets';
	$body = $extTagOffsets->stripTags( $source );
	return $body;
}
227 | |
/**
 * Helper that checks a single offset value.
 *
 * @param ?int $n value to check
 * @return bool true when $n is non-null and zero or positive
 */
private static function isValidOffset( ?int $n ): bool {
	if ( $n === null ) {
		return false;
	}
	return $n >= 0;
}
237 | |
/**
 * Basic check if a DOM Source Range (DSR) is valid.
 *
 * Clarifications about the "basic validity checks":
 * - Only checks for underflow, not for overflow.
 * - Does not verify that start <= end
 * - Does not verify that openWidth + endWidth <= end - start
 *   (even so, the values might be invalid because of content)
 * These would be overkill for our purposes. Given how DSR computation
 * works in this codebase, the real scenarios we care about are
 * non-null / non-negative values since that can happen.
 *
 * @param ?DomSourceRange $dsr DSR source range values
 * @param bool $all Also check the widths of the container tag
 * @return bool
 */
public static function isValidDSR(
	?DomSourceRange $dsr, bool $all = false
): bool {
	if ( $dsr === null ) {
		return false;
	}
	if ( !self::isValidOffset( $dsr->start ) || !self::isValidOffset( $dsr->end ) ) {
		return false;
	}
	if ( !$all ) {
		// Only the start/end offsets were requested
		return true;
	}
	return self::isValidOffset( $dsr->openWidth ) &&
		self::isValidOffset( $dsr->closeWidth );
}
266 | |
/**
 * Canonicalizes a namespace name: lowercases it (multibyte-aware)
 * and replaces spaces with underscores.
 *
 * @param string $name Non-normalized namespace name.
 * @return string
 */
public static function normalizeNamespaceName( string $name ): string {
	$lowered = mb_strtolower( $name );
	return str_replace( ' ', '_', $lowered );
}
276 | |
/**
 * Decode HTML5 entities in wikitext.
 *
 * NOTE that wikitext only allows semicolon-terminated entities, while
 * HTML allows a number of "legacy" entities to be decoded without
 * a terminating semicolon. This function deliberately does not
 * decode these HTML-only entity forms.
 *
 * @param string $text
 * @return string
 */
public static function decodeWtEntities( string $text ): string {
	// HTML5 accepts semicolon-less entities but wikitext does not:
	// every wikitext entity must end in a semicolon.
	// Normalizing first means entity references that are invalid in
	// wikitext (mostly because they decode to invalid codepoints)
	// are deliberately left undecoded.
	$normalized = Sanitizer::normalizeCharReferences( $text );
	return Sanitizer::decodeCharReferences( $normalized );
}
299 | |
/**
 * Entity-escape anything that would decode to a valid wikitext entity.
 *
 * Note that HTML5 allows certain "semicolon-less" entities, like
 * `&para`; these aren't allowed in wikitext and won't be escaped
 * by this function.
 *
 * @param string $text
 * @return string $text with the leading ampersand of every valid
 *  wikitext entity replaced by `&amp;`
 */
public static function escapeWtEntities( string $text ): string {
	// We just want to encode ampersands that precede valid entities.
	// (And note that semicolon-less entities aren't valid wikitext.)
	return preg_replace_callback(
		'/&[#0-9a-zA-Z\x80-\xff]+;/',
		static function ( $match ) {
			$m = $match[0];
			if ( self::decodeWtEntities( $m ) !== $m ) {
				// It decodes to something else, so it is a real entity:
				// escape the ampersand. The replacement must be the
				// literal five characters "&amp;"; a bare "&" would
				// hand back the input unchanged.
				return '&amp;' . substr( $m, 1 );
			}
			// Not an entity, just return the string
			return $m;
		},
		$text
	);
}
325 | |
/**
 * Ensure that the given literal string is safe to parse as wikitext.
 * See wfEscapeWikiText() in core.
 *
 * Characters and character sequences that could be read as wikitext
 * syntax are replaced with decimal numeric character references
 * (for example `[` becomes `&#91;`).
 *
 * @param string $input literal text
 * @return string the escaped text
 */
public static function escapeWt( string $input ): string {
	// The replacement tables are built once and cached in statics.
	static $repl = null, $repl2 = null, $repl3 = null, $repl4 = null;
	if ( $repl === null ) {
		$repl = [
			'"' => '&#34;', '&' => '&#38;', "'" => '&#39;', '<' => '&#60;',
			'=' => '&#61;', '>' => '&#62;', '[' => '&#91;', ']' => '&#93;',
			'{' => '&#123;', '|' => '&#124;', '}' => '&#125;',
			';' => '&#59;', // a token inside language converter brackets
			'!!' => '&#33;!', // a token inside table context
			"\n!" => "\n&#33;", "\r!" => "\r&#33;", // a token inside table context
			"\n#" => "\n&#35;", "\r#" => "\r&#35;",
			"\n*" => "\n&#42;", "\r*" => "\r&#42;",
			"\n:" => "\n&#58;", "\r:" => "\r&#58;",
			"\n " => "\n&#32;", "\r " => "\r&#32;",
			"\n\n" => "\n&#10;", "\r\n" => "&#13;\n",
			"\n\r" => "\n&#13;", "\r\r" => "\r&#13;",
			"\n\t" => "\n&#9;", "\r\t" => "\r&#9;", // "\n\t\n" is treated like "\n\n"
			"\n----" => "\n&#45;---", "\r----" => "\r&#45;---",
			'__' => '_&#95;', '://' => '&#58;//',
			'~~~' => '~~&#126;', // protect from PST, just to be safe(r)
		];

		$magicLinks = [ 'ISBN', 'PMID', 'RFC' ];
		// We have to catch everything "\s" matches in PCRE
		foreach ( $magicLinks as $magic ) {
			$repl["$magic "] = "$magic&#32;";
			$repl["$magic\t"] = "$magic&#9;";
			$repl["$magic\r"] = "$magic&#13;";
			$repl["$magic\n"] = "$magic&#10;";
			$repl["$magic\f"] = "$magic&#12;";
		}
		// Additionally escape the following characters at the beginning of the
		// string, in case they merge to form tokens when spliced into a
		// string.  Tokens like -{ {{ [[ {| etc are already escaped because
		// the second character is escaped above, but the following tokens
		// are handled here: |+ |- __FOO__ ~~~
		$repl3 = [
			'+' => '&#43;', '-' => '&#45;', '_' => '&#95;', '~' => '&#126;',
		];
		// Similarly, protect the following characters at the end of the
		// string, which could form the start of `__FOO__` or `~~~~`.
		// A trailing newline could also form the unintended start of a
		// paragraph break if it is glued to a newline in the following
		// context.
		$repl4 = [
			'_' => '&#95;', '~' => '&#126;',
			"\n" => "&#10;", "\r" => "&#13;",
			"\t" => "&#9;", // "\n\t\n" is treated like "\n\n"
		];

		// And handle protocols that don't use "://"
		$urlProtocols = [
			'bitcoin:', 'geo:', 'magnet:', 'mailto:', 'matrix:', 'news:',
			'sip:', 'sips:', 'sms:', 'tel:', 'urn:', 'xmpp:',
		];
		$repl2 = [];
		foreach ( $urlProtocols as $prot ) {
			// Drop the trailing ':'; the pattern below re-adds it
			$repl2[] = preg_quote( substr( $prot, 0, -1 ), '/' );
		}
		$repl2 = '/\b(' . implode( '|', $repl2 ) . '):/i';
	}
	// Tell phan that $repl2, $repl3 and $repl4 will also be non-null here
	'@phan-var string $repl2';
	'@phan-var array<string,string> $repl3';
	'@phan-var array<string,string> $repl4';
	// Prepend a newline so the start-of-line rules ("\n*", "\n#", ...)
	// also apply at the very start of the input, then drop it again.
	$text = substr( strtr( "\n$input", $repl ), 1 );
	if ( $text === '' ) {
		return $text;
	}
	$first = strtr( $text[0], $repl3 ); // protect first character
	if ( strlen( $text ) > 1 ) {
		$text = $first . substr( $text, 1, -1 ) .
			strtr( substr( $text, -1 ), $repl4 ); // protect last character
	} else {
		// special case for single-character strings
		$text = strtr( $first, $repl4 ); // protect last character
	}
	// Escape the colon of protocols such as "mailto:"
	$text = preg_replace( $repl2, '$1&#58;', $text );
	return $text;
}
411 | |
/**
 * Convert special characters to HTML entities
 *
 * Besides the five characters handled by htmlspecialchars(), the
 * combining character U+0338 is emitted as the numeric reference
 * `&#x338;`.
 *
 * @param string $s
 * @return string
 */
public static function escapeHtml( string $s ): string {
	// Only encodes five characters: " ' & < >
	$s = htmlspecialchars( $s, ENT_QUOTES | ENT_HTML5 );
	// NOTE(review): presumably this keeps a following U+0338 from
	// combining with an adjacent character if the output is later
	// Unicode-normalized; confirm against core's equivalent escaping.
	$s = str_replace( "\u{0338}", '&#x338;', $s );
	return $s;
}
424 | |
/**
 * Encode all characters as entity references.  This is done to make
 * characters safe for wikitext (regardless of whether they are
 * HTML-safe). Typically only called with single-codepoint strings.
 *
 * Every code point is written as a hexadecimal numeric character
 * reference, then adjusted by two conventions: single-digit codes are
 * zero-padded to two digits, and U+00A0 is written as `&nbsp;`.
 *
 * @param string $s
 * @return string
 */
public static function entityEncodeAll( string $s ): string {
	// This is Unicode aware.
	static $conventions = [
		// We always use at least two characters for the hex code
		'&#x0;' => '&#x00;', '&#x1;' => '&#x01;', '&#x2;' => '&#x02;', '&#x3;' => '&#x03;',
		'&#x4;' => '&#x04;', '&#x5;' => '&#x05;', '&#x6;' => '&#x06;', '&#x7;' => '&#x07;',
		'&#x8;' => '&#x08;', '&#x9;' => '&#x09;', '&#xA;' => '&#x0A;', '&#xB;' => '&#x0B;',
		'&#xC;' => '&#x0C;', '&#xD;' => '&#x0D;', '&#xE;' => '&#x0E;', '&#xF;' => '&#x0F;',
		// By convention we use &nbsp; where possible
		'&#xA0;' => '&nbsp;',
	];

	// The conversion map [ 0, 0x10ffff, 0, ~0 ] covers every code point
	// with no offset and a full mask; the final `true` requests hex output.
	return strtr( mb_encode_numericentity(
		$s, [ 0, 0x10ffff, 0, ~0 ], 'utf-8', true
	), $conventions );
}
448 | |
/**
 * Determine whether the protocol of a link is potentially valid,
 * according to the environment's per-wiki site config.
 *
 * Non-string targets are not checked and are reported as valid.
 *
 * @param mixed $linkTarget
 * @param Env $env
 * @return bool
 */
public static function isProtocolValid( $linkTarget, Env $env ): bool {
	$siteConf = $env->getSiteConfig();
	if ( !is_string( $linkTarget ) ) {
		return true;
	}
	return $siteConf->hasValidProtocol( $linkTarget );
}
465 | |
/**
 * Build the DataMw describing an extension tag token: its name, its
 * attributes (taken from the token's 'options') and, when the tag has
 * a non-zero close width, its body.
 *
 * @param Token $extToken
 * @return DataMw
 */
public static function getExtArgInfo( Token $extToken ): DataMw {
	$tagName = $extToken->getAttributeV( 'name' );
	$options = $extToken->getAttributeV( 'options' );
	$dataMw = new DataMw( [
		'name' => $tagName,
		// Back-compat w/ existing DOM spec output: ensure 'attrs'
		// exists even if there are no attributes.
		'attrs' => (object)[],
	] );
	foreach ( TokenUtils::kvToHash( $options ) as $attrName => $attrValue ) {
		// A numeric attribute name turns into an int once it has been
		// used as an array key, hence the explicit cast back to string.
		$dataMw->setExtAttrib( (string)$attrName, $attrValue );
	}
	$isSelfClosing = $extToken->dataParsoid->extTagOffsets->closeWidth === 0;
	if ( !$isSelfClosing ) {
		$dataMw->body = new DataMwBody(
			self::extractExtBody( $extToken ),
		);
	}
	return $dataMw;
}
496 | |
/**
 * Parse media dimensions of the form `<width>`, `<width>x<height>` or
 * `x<height>`, optionally followed by `px`.
 *
 * NOTE(review): as written, the 'x' and 'y' keys are always present in
 * the result and hold null when the corresponding number is missing
 * (or is "0", which empty() treats as absent).
 *
 * @param SiteConfig $siteConfig
 * @param string $str media dimension string to parse
 * @param bool $onlyOne If set, returns null if multiple dimensions are present
 * @param bool $localized Defaults to false; set to true if the $str
 *   has already been matched against `img_width` to localize the `px`
 *   suffix.
 * @return ?array{x:int,y?:int,bogusPx:bool}
 */
public static function parseMediaDimensions(
	SiteConfig $siteConfig, string $str, bool $onlyOne = false,
	bool $localized = false
): ?array {
	if ( !$localized ) {
		// Unwrap a (possibly localized) `img_width` option to its value
		$matchOption = $siteConfig->getMediaPrefixParameterizedAliasMatcher();
		$bits = $matchOption( $str );
		$optionKey = $bits ? mb_strtolower( trim( $bits['k'] ) ) : null;
		if ( $optionKey === 'img_width' ) {
			$str = $bits['v'];
		}
	}
	// We support a trailing 'px' here for historical reasons
	// (T15500, T53628, T207032)
	if ( !preg_match( '/^(\d*)(?:x(\d+))?\s*(px\s*)?$/D', $str, $match ) ) {
		return null;
	}
	$hasHeight = !empty( $match[2] );
	if ( $hasHeight && $onlyOne ) {
		return null;
	}
	return [
		'x' => !empty( $match[1] ) ? intval( $match[1], 10 ) : null,
		'y' => $hasHeight ? intval( $match[2], 10 ) : null,
		'bogusPx' => !empty( $match[3] ),
	];
}
540 | |
/**
 * Validate a media parameter: it must be non-null and strictly positive.
 * More generally, this is defined by the media handler in core
 *
 * @param ?int $num
 * @return bool
 */
public static function validateMediaParam( ?int $num ): bool {
	if ( $num === null ) {
		return false;
	}
	return $num > 0;
}
551 | |
/**
 * Convert BCP-47-compliant language code to MediaWiki-internal code.
 *
 * This is a temporary back-compatibility hack; Parsoid should be
 * using BCP 47 strings or Bcp47Code objects in all its external APIs.
 * Try to avoid using it, though: there's no guarantee
 * that this mapping will remain in sync with upstream.
 *
 * @param string|Bcp47Code $code BCP-47 language code
 * @return string MediaWiki-internal language code
 */
public static function bcp47ToMwCode( $code ): string {
	// This map is dumped from
	// LanguageCode::NON_STANDARD_LANGUAGE_CODE_MAPPING in core, but
	// with keys and values swapped and BCP-47 codes lowercased:
	//
	//   array_flip(array_map(strtolower,
	//       LanguageCode::NON_STANDARD_LANGUAGE_CODE_MAPPING))
	//
	// Hopefully we will be able to deprecate and remove this from
	// Parsoid quickly enough that keeping it in sync with upstream
	// is not an issue.
	static $MAP = [
		"cbk" => "cbk-zam",
		"de-x-formal" => "de-formal",
		"egl" => "eml",
		"en-x-rtl" => "en-rtl",
		"es-x-formal" => "es-formal",
		"hu-x-formal" => "hu-formal",
		"jv-x-bms" => "map-bms",
		"ro-cyrl-md" => "mo",
		"nrf" => "nrm",
		"nl-x-informal" => "nl-informal",
		"nap-x-tara" => "roa-tara",
		"en-simple" => "simple",
		"sr-cyrl" => "sr-ec",
		"sr-latn" => "sr-el",
		"zh-hans-cn" => "zh-cn",
		"zh-hans-sg" => "zh-sg",
		"zh-hans-my" => "zh-my",
		"zh-hant-tw" => "zh-tw",
		"zh-hant-hk" => "zh-hk",
		"zh-hant-mo" => "zh-mo",
	];
	$bcp47 = $code instanceof Bcp47Code ? $code->toBcp47Code() : $code;
	// All MW-internal codes are lowercase
	$lowered = strtolower( $bcp47 );
	return $MAP[$lowered] ?? $lowered;
}
602 | |
/**
 * Convert MediaWiki-internal language code to a BCP-47-compliant
 * language code suitable for including in HTML.
 *
 * This is a temporary back-compatibility hack, needed for compatibility
 * when running in standalone mode with MediaWiki Action APIs which expose
 * internal language codes.  These APIs should eventually be improved
 * so that they also expose BCP-47 compliant codes, which can then be
 * used directly by Parsoid without conversion.  But until that day
 * comes, this function will paper over the differences.
 *
 * Note that MediaWiki-internal Language objects implement Bcp47Code,
 * so we can transition interfaces which currently take a string code
 * to pass a Language object instead; that will make this method
 * effectively a no-op and avoid the issue of upstream sync of the
 * mapping table.
 *
 * @param string|Bcp47Code $code MediaWiki-internal language code or object
 * @param bool $strict If true, this code will log a deprecation message
 *  or fail if a MediaWiki-internal language code is passed.
 * @param ?LoggerInterface $warnLogger A deprecation warning will be
 *  emitted on $warnLogger if $strict is true and a string-valued
 *  MediaWiki-internal language code is passed; otherwise an exception
 *  will be thrown.
 * @return Bcp47Code BCP-47 language code.
 * @see LanguageCode::bcp47()
 */
public static function mwCodeToBcp47(
	$code, bool $strict = false, ?LoggerInterface $warnLogger = null
): Bcp47Code {
	if ( $code instanceof Bcp47Code ) {
		return $code;
	}
	if ( $strict ) {
		$msg = "Use of string-valued BCP-47 codes is deprecated.";
		// Test runs always get an error; so does strict mode
		// without a deprecation logger to report to.
		$inTests = defined( 'MW_PHPUNIT_TEST' ) || defined( 'MW_PARSER_TEST' );
		if ( $inTests || !$warnLogger ) {
			throw new \Error( $msg );
		}
		$warnLogger->warning( $msg );
	}
	// This map is dumped from
	// LanguageCode::getNonstandardLanguageCodeMapping() in core.
	// Hopefully we will be able to deprecate and remove this method
	// from Parsoid quickly enough that keeping it in sync with upstream
	// will not be an issue.
	static $MAP = [
		"als" => "gsw",
		"bat-smg" => "sgs",
		"be-x-old" => "be-tarask",
		"fiu-vro" => "vro",
		"roa-rup" => "rup",
		"zh-classical" => "lzh",
		"zh-min-nan" => "nan",
		"zh-yue" => "yue",
		"cbk-zam" => "cbk",
		"de-formal" => "de-x-formal",
		"eml" => "egl",
		"en-rtl" => "en-x-rtl",
		"es-formal" => "es-x-formal",
		"hu-formal" => "hu-x-formal",
		"map-bms" => "jv-x-bms",
		"mo" => "ro-Cyrl-MD",
		"nrm" => "nrf",
		"nl-informal" => "nl-x-informal",
		"roa-tara" => "nap-x-tara",
		"simple" => "en-simple",
		"sr-ec" => "sr-Cyrl",
		"sr-el" => "sr-Latn",
		"zh-cn" => "zh-Hans-CN",
		"zh-sg" => "zh-Hans-SG",
		"zh-my" => "zh-Hans-MY",
		"zh-tw" => "zh-Hant-TW",
		"zh-hk" => "zh-Hant-HK",
		"zh-mo" => "zh-Hant-MO",
	];
	$mapped = $MAP[$code] ?? $code;
	// The casing rules below follow LanguageCode::bcp47() in core.
	$segments = explode( '-', $mapped );
	$cased = [];
	foreach ( $segments as $pos => $segment ) {
		$length = strlen( $segment );
		// A segment right after an "x" segment is private-use
		$afterPrivateMarker = $pos > 0 && strtolower( $segments[$pos - 1] ) === 'x';
		if ( $afterPrivateMarker ) {
			// private segments are lowercase
			$cased[$pos] = strtolower( $segment );
		} elseif ( $pos > 0 && $length === 2 ) {
			// ISO 3166 country code
			$cased[$pos] = strtoupper( $segment );
		} elseif ( $pos > 0 && $length === 4 ) {
			// ISO 15924 script code
			$cased[$pos] = ucfirst( strtolower( $segment ) );
		} else {
			// Use lowercase for other cases
			$cased[$pos] = strtolower( $segment );
		}
	}
	return new Bcp47CodeValue( implode( '-', $cased ) );
}
706 | |
/**
 * BCP 47 codes are case-insensitive, so this helper compares two
 * Bcp47Code objects by their string codes, ignoring case.
 *
 * @param Bcp47Code $a
 * @param Bcp47Code $b
 * @return bool true iff $a and $b represent the same language
 */
public static function isBcp47CodeEqual( Bcp47Code $a, Bcp47Code $b ): bool {
	$left = $a->toBcp47Code();
	$right = $b->toBcp47Code();
	return strcasecmp( $left, $right ) === 0;
}
717 | } |