Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
13.58% |
33 / 243 |
|
21.43% |
6 / 28 |
CRAP | |
0.00% |
0 / 1 |
Utils | |
13.58% |
33 / 243 |
|
21.43% |
6 / 28 |
3903.67 | |
0.00% |
0 / 1 |
stripParsoidIdPrefix | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
stripNamespace | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
isParsoidObjectId | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
isVoidElement | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
recursiveClone | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
clone | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
56 | |||
lastUniChar | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
30 | |||
isUniWord | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
phpURLEncode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
decodeURI | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
decodeURIComponent | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
3 | |||
extractExtBody | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
isValidOffset | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
isValidDSR | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
42 | |||
normalizeNamespaceName | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
decodeWtEntities | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
escapeWtEntities | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
2 | |||
escapeWt | |
0.00% |
0 / 53 |
|
0.00% |
0 / 1 |
42 | |||
escapeHtml | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
entityEncodeAll | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
1 | |||
isProtocolValid | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
getExtArgInfo | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
6 | |||
parseMediaDimensions | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
90 | |||
validateMediaParam | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
isLinkTrail | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
bcp47ToMwCode | |
0.00% |
0 / 26 |
|
0.00% |
0 / 1 |
6 | |||
mwCodeToBcp47 | |
0.00% |
0 / 51 |
|
0.00% |
0 / 1 |
182 | |||
isBcp47CodeEqual | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Utils; |
5 | |
6 | use Psr\Log\LoggerInterface; |
7 | use Wikimedia\Bcp47Code\Bcp47Code; |
8 | use Wikimedia\Bcp47Code\Bcp47CodeValue; |
9 | use Wikimedia\Parsoid\Config\Env; |
10 | use Wikimedia\Parsoid\Config\SiteConfig; |
11 | use Wikimedia\Parsoid\Core\DomSourceRange; |
12 | use Wikimedia\Parsoid\Core\Sanitizer; |
13 | use Wikimedia\Parsoid\NodeData\DataMw; |
14 | use Wikimedia\Parsoid\Tokens\Token; |
15 | use Wikimedia\Parsoid\Wikitext\Consts; |
16 | |
17 | /** |
18 | * This file contains general utilities for token transforms. |
19 | */ |
20 | class Utils { |
/**
 * Regular expression fragment for matching wikitext comments.
 * Meant for inclusion in other regular expressions; it carries no
 * delimiters or flags of its own.
 */
// Maintenance note: this is used in /x regexes so all whitespace and # should be escaped
public const COMMENT_REGEXP_FRAGMENT = '<!--(?>[\s\S]*?-->)';
/** Delimited regular expression for matching a wikitext comment */
public const COMMENT_REGEXP = '/' . self::COMMENT_REGEXP_FRAGMENT . '/';
29 | |
/**
 * Remove the Parsoid id prefix from an about id.
 *
 * A leading "mwt", optionally preceded by "#", is dropped; any other
 * input is returned unchanged.
 *
 * @param string $aboutId about id string
 * @return string
 */
public static function stripParsoidIdPrefix( string $aboutId ): string {
	// 'mwt' is the prefix used for new ids
	$prefixPattern = '/^#?mwt/';
	return preg_replace( $prefixPattern, '', $aboutId );
}
40 | |
/**
 * Reduce a fully qualified class name to its unqualified short name.
 *
 * Everything up to and including the last backslash is removed.
 *
 * @param string $className
 * @return string
 */
public static function stripNamespace( string $className ): string {
	$shortName = preg_replace( '/.*\\\\/', '', $className );
	return $shortName;
}
49 | |
/**
 * Tell whether an about id carries the Parsoid id prefix.
 *
 * @param string $aboutId about id string
 * @return bool true when $aboutId begins with "#mwt"
 */
public static function isParsoidObjectId( string $aboutId ): bool {
	// 'mwt' is the prefix used for new ids
	$prefix = '#mwt';
	return strncmp( $aboutId, $prefix, strlen( $prefix ) ) === 0;
}
60 | |
/**
 * Tell whether the named tag is a void element, i.e. one that can not
 * have content.
 *
 * The answer is a lookup of $name in the 'VoidTags' table of
 * Consts::$HTML.
 *
 * @param string $name tag name
 * @return bool
 */
public static function isVoidElement( string $name ): bool {
	$isVoid = isset( Consts::$HTML['VoidTags'][$name] );
	return $isVoid;
}
70 | |
/**
 * Helper that always requests a deep clone from self::clone().
 *
 * @param object $el object
 * @return object
 */
private static function recursiveClone( $el ) {
	$deepCopy = self::clone( $el, true );
	return $deepCopy;
}
80 | |
/**
 * Clone a value; deep by default.
 *
 * @param object|array $obj arrays or plain objects.
 *  Tokens or DOM nodes shouldn't be passed in.
 *
 *  CAVEAT: debugging callers appear to pass arrays that can hold DOM
 *  nodes. So, only when $debug is set, a top-level DOM node or DOM nodes
 *  embedded in arrays are handled via cloneNode(). An object that embeds
 *  a DOM node is still not supported.
 *
 * @param bool $deepClone
 * @param bool $debug
 * @return object|array
 */
public static function clone( $obj, $deepClone = true, $debug = false ) {
	if ( $debug && $obj instanceof \DOMNode ) {
		return $obj->cloneNode( $deepClone );
	}

	if ( $debug && is_array( $obj ) ) {
		if ( !$deepClone ) {
			// Arrays have value semantics, so returning the array is
			// already a (copy-on-write) shallow clone.
			return $obj;
		}
		$copies = [];
		foreach ( $obj as $key => $value ) {
			$copies[$key] = self::clone( $value, true, true );
		}
		return $copies;
	}

	if ( is_object( $obj ) && !$deepClone ) {
		return clone $obj;
	}

	// FIXME, see T161647
	// The serialization round trip fails if $obj is (or embeds) a DOMNode
	return unserialize( serialize( $obj ) );
}
122 | |
/**
 * Extract the last *unicode* character of the string.
 * This might be more than one byte, if the last character
 * is non-ASCII.
 *
 * The scan walks backwards over UTF-8 continuation bytes (10xxxxxx) and
 * never reads before the start of the string, so an empty string yields
 * '' and malformed input that begins with continuation bytes yields the
 * bytes scanned so far instead of running off the front of the string.
 *
 * @param string $str
 * @param ?int $idx The index *after* the character to extract; defaults
 *  to the length of $str, which will extract the last character in
 *  $str.
 * @return string The character, or '' when $str is empty or $idx is
 *  outside 1..strlen( $str ).
 */
public static function lastUniChar( string $str, ?int $idx = null ): string {
	$len = strlen( $str );
	if ( $idx === null ) {
		$idx = $len;
	}
	// Also covers the empty string with the default index.
	if ( $idx <= 0 || $idx > $len ) {
		return '';
	}
	$c = $str[--$idx];
	// ord() looks at the first byte of $c, i.e. the byte prepended last.
	// Negative string offsets count from the end in PHP, so the lower
	// bound must be checked explicitly.
	while ( $idx > 0 && ( ord( $c ) & 0xC0 ) === 0x80 ) {
		$c = $str[--$idx] . $c;
	}
	return $c;
}
145 | |
/**
 * Tell whether $s starts with a unicode word character.
 *
 * @param string $s
 * @return bool true only when the pattern matched; a non-match or a
 *  preg_match() failure both yield false.
 */
public static function isUniWord( string $s ): bool {
	$matched = preg_match( '#^\w#u', $s );
	return $matched === 1;
}
154 | |
/**
 * Placeholder that must not be called: it unconditionally throws and
 * points the caller at urlencode().
 *
 * @param string $txt URL to encode using PHP encoding
 * @return string
 */
public static function phpURLEncode( $txt ) {
	// @phan-suppress-previous-line PhanPluginNeverReturnMethod
	$message = 'Use urlencode( $txt ) instead';
	throw new \BadMethodCallException( $message );
}
164 | |
/**
 * Percent-decode only valid UTF-8 characters, leaving other encoded bytes alone.
 *
 * Unlike `decodeURIComponent`, the escapes of the reserved characters
 * are kept encoded, matching the behavior of JavaScript's decodeURI().
 *
 * @see https://www.ecma-international.org/ecma-262/6.0/#sec-decodeuri-encodeduri
 * @param string $s URI to be decoded
 * @return string
 */
public static function decodeURI( string $s ): string {
	// Protect the reserved-character escapes by encoding their '%' as
	// '%25'; decodeURIComponent() then turns '%25' back into '%' and the
	// original escape survives.
	$reservedEscape = '/%(?=2[346bcfBCF]|3[abdfABDF]|40)/';
	$protected = preg_replace( $reservedEscape, '%25', $s );
	return self::decodeURIComponent( $protected );
}
180 | |
/**
 * Percent-decode only valid UTF-8 characters, leaving other encoded bytes alone.
 *
 * @param string $s URI to be decoded
 * @return string
 */
public static function decodeURIComponent( string $s ): string {
	// Fast path: most of the time the whole input decodes to valid UTF-8
	$decoded = rawurldecode( $s );
	if ( !mb_check_encoding( $decoded, 'UTF-8' ) ) {
		// Slow path: pick out each encoded character (1 to 4 escaped
		// bytes) and decode it on its own, keeping the escape when the
		// result is not valid UTF-8.
		$decoded = preg_replace_callback(
			// phpcs:ignore Generic.Files.LineLength.TooLong
			'/%[0-7][0-9A-F]|%[CD][0-9A-F]%[89AB][0-9A-F]|%E[0-9A-F](?:%[89AB][0-9A-F]){2}|%F[0-4](?:%[89AB][0-9A-F]){3}/i',
			static function ( $escape ) {
				$char = rawurldecode( $escape[0] );
				if ( mb_check_encoding( $char, 'UTF-8' ) ) {
					return $char;
				}
				return $escape[0];
			},
			$s
		);
	}
	return $decoded;
}
204 | |
/**
 * Extract the extension source from an extension tag token.
 *
 * The token's 'source' attribute is passed through stripTags() of the
 * token's extTagOffsets.
 *
 * @param Token $token token
 * @return string
 */
public static function extractExtBody( Token $token ): string {
	$source = $token->getAttributeV( 'source' );
	$extTagOffsets = $token->dataParsoid->extTagOffsets;
	'@phan-var \Wikimedia\Parsoid\Core\DomSourceRange $extTagOffsets';
	$body = $extTagOffsets->stripTags( $source );
	return $body;
}
217 | |
/**
 * Helper: an offset is valid when it is set and not negative.
 *
 * @param ?int $n offset to check
 * @return bool
 */
private static function isValidOffset( ?int $n ): bool {
	if ( $n === null ) {
		return false;
	}
	return $n >= 0;
}
227 | |
/**
 * Basic check if a DOM Source Range (DSR) is valid.
 *
 * Clarifications about the "basic validity checks":
 * - Only checks for underflow, not for overflow.
 * - Does not verify that start <= end
 * - Does not verify that openWidth + endWidth <= end - start
 *   (even so, the values might be invalid because of content)
 * These would be overkill for our purposes. Given how DSR computation
 * works in this codebase, the real scenarios we care about are
 * non-null / non-negative values since that can happen.
 *
 * @param ?DomSourceRange $dsr DSR source range values
 * @param bool $all Also check the widths of the container tag
 * @return bool
 */
public static function isValidDSR(
	?DomSourceRange $dsr, bool $all = false
): bool {
	if ( $dsr === null ) {
		return false;
	}
	if ( !self::isValidOffset( $dsr->start ) || !self::isValidOffset( $dsr->end ) ) {
		return false;
	}
	if ( !$all ) {
		return true;
	}
	return self::isValidOffset( $dsr->openWidth ) &&
		self::isValidOffset( $dsr->closeWidth );
}
256 | |
/**
 * Canonicalize a namespace name: lowercase it and turn spaces into
 * underscores.
 *
 * @param string $name Non-normalized namespace name.
 * @return string
 */
public static function normalizeNamespaceName( string $name ): string {
	$lowered = mb_strtolower( $name );
	return str_replace( ' ', '_', $lowered );
}
266 | |
/**
 * Decode HTML5 entities in wikitext.
 *
 * NOTE that wikitext only allows semicolon-terminated entities, while
 * HTML allows a number of "legacy" entities to be decoded without
 * a terminating semicolon. This function deliberately does not
 * decode these HTML-only entity forms.
 *
 * @param string $text
 * @return string
 */
public static function decodeWtEntities( string $text ): string {
	// The text is normalized first and decoded second. Per the upstream
	// note, normalizing before decoding is what keeps entity references
	// that are invalid in wikitext (mostly because they decode to
	// invalid codepoints, and also the semicolon-less HTML5 forms) from
	// being decoded.
	$normalized = Sanitizer::normalizeCharReferences( $text );
	return Sanitizer::decodeCharReferences( $normalized );
}
289 | |
/**
 * Entity-escape anything that would decode to a valid wikitext entity.
 *
 * Note that HTML5 allows certain "semicolon-less" entities, like
 * `&para`; these aren't allowed in wikitext and won't be escaped
 * by this function.
 *
 * @param string $text
 * @return string $text with the leading ampersand of every decodable
 *  entity replaced by `&amp;`
 */
public static function escapeWtEntities( string $text ): string {
	// We just want to encode ampersands that precede valid entities.
	// (And note that semicolon-less entities aren't valid wikitext.)
	return preg_replace_callback(
		'/&[#0-9a-zA-Z\x80-\xff]+;/',
		static function ( array $match ): string {
			$candidate = $match[0];
			if ( self::decodeWtEntities( $candidate ) === $candidate ) {
				// Decoding left it untouched: not an entity, keep as-is
				return $candidate;
			}
			// A real entity: escape its ampersand so it stays literal
			return '&amp;' . substr( $candidate, 1 );
		},
		$text
	);
}
315 | |
/**
 * Ensure that the given literal string is safe to parse as wikitext.
 * See wfEscapeWikiText() in core.
 *
 * Characters and character sequences that could form wikitext tokens are
 * replaced by decimal numeric character references. Extra protection is
 * applied to the first and last character, which could merge with the
 * surrounding context to form a token once the result is spliced in.
 *
 * @param string $input literal text
 * @return string wikitext-safe text
 */
public static function escapeWt( string $input ): string {
	// The tables are built once and memoized in static locals.
	static $repl = null, $repl2 = null, $repl3 = null, $repl4 = null;
	if ( $repl === null ) {
		$repl = [
			'"' => '&#34;', '&' => '&#38;', "'" => '&#39;', '<' => '&#60;',
			'=' => '&#61;', '>' => '&#62;', '[' => '&#91;', ']' => '&#93;',
			'{' => '&#123;', '|' => '&#124;', '}' => '&#125;',
			';' => '&#59;', // a token inside language converter brackets
			'!!' => '&#33;!', // a token inside table context
			"\n!" => "\n&#33;", "\r!" => "\r&#33;", // a token inside table context
			"\n#" => "\n&#35;", "\r#" => "\r&#35;",
			"\n*" => "\n&#42;", "\r*" => "\r&#42;",
			"\n:" => "\n&#58;", "\r:" => "\r&#58;",
			"\n " => "\n&#32;", "\r " => "\r&#32;",
			"\n\n" => "\n&#10;", "\r\n" => "&#13;\n",
			"\n\r" => "\n&#13;", "\r\r" => "\r&#13;",
			"\n\t" => "\n&#9;", "\r\t" => "\r&#9;", // "\n\t\n" is treated like "\n\n"
			"\n----" => "\n&#45;---", "\r----" => "\r&#45;---",
			'__' => '_&#95;', '://' => '&#58;//',
			'~~~' => '~~&#126;', // protect from PST, just to be safe(r)
		];

		$magicLinks = [ 'ISBN', 'PMID', 'RFC' ];
		// We have to catch everything "\s" matches in PCRE
		foreach ( $magicLinks as $magic ) {
			$repl["$magic "] = "$magic&#32;";
			$repl["$magic\t"] = "$magic&#9;";
			$repl["$magic\r"] = "$magic&#13;";
			$repl["$magic\n"] = "$magic&#10;";
			$repl["$magic\f"] = "$magic&#12;";
		}
		// Additionally escape the following characters at the beginning of the
		// string, in case they merge to form tokens when spliced into a
		// string. Tokens like -{ {{ [[ {| etc are already escaped because
		// the second character is escaped above, but the following tokens
		// are handled here: |+ |- __FOO__ ~~~
		$repl3 = [
			'+' => '&#43;', '-' => '&#45;', '_' => '&#95;', '~' => '&#126;',
		];
		// Similarly, protect the following characters at the end of the
		// string, which could form the start of `__FOO__` or `~~~~`.
		// A trailing newline could also form the unintended start of a
		// paragraph break if it is glued to a newline in the following
		// context.
		$repl4 = [
			'_' => '&#95;', '~' => '&#126;',
			"\n" => "&#10;", "\r" => "&#13;",
			"\t" => "&#9;", // "\n\t\n" is treated like "\n\n"
		];

		// And handle protocols that don't use "://"
		$urlProtocols = [
			'bitcoin:', 'geo:', 'magnet:', 'mailto:', 'matrix:', 'news:',
			'sip:', 'sips:', 'sms:', 'tel:', 'urn:', 'xmpp:',
		];
		$repl2 = [];
		foreach ( $urlProtocols as $prot ) {
			$repl2[] = preg_quote( substr( $prot, 0, -1 ), '/' );
		}
		$repl2 = '/\b(' . implode( '|', $repl2 ) . '):/i';
	}
	// Tell phan that $repl2, $repl3 and $repl4 will also be non-null here
	'@phan-var string $repl2';
	'@phan-var array<string,string> $repl3';
	'@phan-var array<string,string> $repl4';
	// A newline is prepended so that the start-of-line rules ("\n*" etc.)
	// also apply to the very start of the input; it is stripped again
	// right after the replacement.
	$text = substr( strtr( "\n$input", $repl ), 1 );
	if ( $text === '' ) {
		return $text;
	}
	$first = strtr( $text[0], $repl3 ); // protect first character
	if ( strlen( $text ) > 1 ) {
		$text = $first . substr( $text, 1, -1 ) .
			strtr( substr( $text, -1 ), $repl4 ); // protect last character
	} else {
		// special case for single-character strings
		$text = strtr( $first, $repl4 ); // protect last character
	}
	// Defuse the colon of protocols that don't use "://"
	$text = preg_replace( $repl2, '$1&#58;', $text );
	return $text;
}
401 | |
/**
 * Convert the HTML special characters of a string to entities.
 *
 * @param string $s
 * @return string
 */
public static function escapeHtml( string $s ): string {
	// With these flags only five characters are encoded: " ' & < >
	$flags = ENT_QUOTES | ENT_HTML5;
	return htmlspecialchars( $s, $flags );
}
412 | |
/**
 * Encode all characters as entity references. This is done to make
 * characters safe for wikitext (regardless of whether they are
 * HTML-safe). Typically only called with single-codepoint strings.
 *
 * Every codepoint is first turned into a hexadecimal numeric character
 * reference; the result is then adjusted to the output conventions
 * listed in $conventions.
 *
 * @param string $s
 * @return string
 */
public static function entityEncodeAll( string $s ): string {
	// This is Unicode aware.
	static $conventions = [
		// We always use at least two characters for the hex code
		'&#x0;' => '&#x00;', '&#x1;' => '&#x01;', '&#x2;' => '&#x02;', '&#x3;' => '&#x03;',
		'&#x4;' => '&#x04;', '&#x5;' => '&#x05;', '&#x6;' => '&#x06;', '&#x7;' => '&#x07;',
		'&#x8;' => '&#x08;', '&#x9;' => '&#x09;', '&#xA;' => '&#x0A;', '&#xB;' => '&#x0B;',
		'&#xC;' => '&#x0C;', '&#xD;' => '&#x0D;', '&#xE;' => '&#x0E;', '&#xF;' => '&#x0F;',
		// By convention we use &nbsp; where possible
		'&#xA0;' => '&nbsp;',
	];

	// The conversion map covers every codepoint (0 .. 0x10ffff) with no
	// offset and a full mask; the final `true` requests hexadecimal
	// references.
	return strtr( mb_encode_numericentity(
		$s, [ 0, 0x10ffff, 0, ~0 ], 'utf-8', true
	), $conventions );
}
436 | |
/**
 * Determine whether the protocol of a link is potentially valid, using
 * the environment's per-wiki site config.
 *
 * A link target that is not a string is always reported as valid.
 *
 * @param mixed $linkTarget
 * @param Env $env
 * @return bool
 */
public static function isProtocolValid( $linkTarget, Env $env ): bool {
	$siteConf = $env->getSiteConfig();
	return is_string( $linkTarget )
		? $siteConf->hasValidProtocol( $linkTarget )
		: true;
}
453 | |
/**
 * Build the DataMw argument information for an extension tag token.
 *
 * The result carries the token's 'name' and its 'options' (converted to
 * an object via TokenUtils::kvToHash()). A 'body' holding the extension
 * source is added unless the close tag width is 0.
 *
 * @param Token $extToken
 * @return DataMw
 */
public static function getExtArgInfo( Token $extToken ): DataMw {
	$dataMw = new DataMw( [
		'name' => $extToken->getAttributeV( 'name' ),
		// T367616: 'attrs' should be renamed to 'extAttrs'
		'attrs' => (object)TokenUtils::kvToHash(
			$extToken->getAttributeV( 'options' )
		),
	] );
	$closeWidth = $extToken->dataParsoid->extTagOffsets->closeWidth;
	if ( $closeWidth === 0 ) {
		// Self-closing: there is no body to record
		return $dataMw;
	}
	$dataMw->body = (object)[
		'extsrc' => self::extractExtBody( $extToken ),
	];
	return $dataMw;
}
477 | |
/**
 * Parse a media dimension string such as "100", "100x200" or "x200px".
 *
 * @param SiteConfig $siteConfig
 * @param string $str media dimension string to parse
 * @param bool $onlyOne If set, returns null if multiple dimensions are present
 * @param bool $localized Defaults to false; set to true if the $str
 *  has already been matched against `img_width` to localize the `px`
 *  suffix.
 * @return ?array{x:?int,y:?int,bogusPx:bool} null when $str is not a
 *  dimension string, or when a height is present while $onlyOne is set.
 *  'x' / 'y' are null when the corresponding part is absent (or is "0").
 */
public static function parseMediaDimensions(
	SiteConfig $siteConfig, string $str, bool $onlyOne = false,
	bool $localized = false
): ?array {
	if ( !$localized ) {
		$matchAlias = $siteConfig->getMediaPrefixParameterizedAliasMatcher();
		$parts = $matchAlias( $str );
		$optionName = $parts ? mb_strtolower( trim( $parts['k'] ) ) : null;
		if ( $optionName === 'img_width' ) {
			$str = $parts['v'];
		}
	}

	// We support a trailing 'px' here for historical reasons
	// (T15500, T53628, T207032)
	if ( !preg_match( '/^(\d*)(?:x(\d+))?\s*(px\s*)?$/D', $str, $m ) ) {
		return null;
	}

	$hasHeight = !empty( $m[2] );
	if ( $hasHeight && $onlyOne ) {
		return null;
	}

	return [
		'x' => !empty( $m[1] ) ? intval( $m[1], 10 ) : null,
		'y' => $hasHeight ? intval( $m[2], 10 ) : null,
		'bogusPx' => !empty( $m[3] ),
	];
}
521 | |
/**
 * Validate a media parameter: it must be set and strictly positive.
 * More generally, this is defined by the media handler in core.
 *
 * @param ?int $num
 * @return bool
 */
public static function validateMediaParam( ?int $num ): bool {
	if ( $num === null ) {
		return false;
	}
	return $num > 0;
}
532 | |
533 | /** |
534 | * This regex was generated by running through *all unicode characters* and |
535 | * testing them against *all regexes* for linktrails in a default MW install. |
536 | * We had to treat it a little bit, here's what we changed: |
537 | * |
538 | * 1. A-Z, though allowed in Walloon, is disallowed. |
539 | * 2. '"', though allowed in Chuvash, is disallowed. |
540 | * 3. '-', though allowed in Icelandic (possibly due to a bug), is disallowed. |
541 | * 4. '1', though allowed in Lak (possibly due to a bug), is disallowed. |
542 | */ |
543 | // phpcs:disable Generic.Files.LineLength.TooLong |
544 | public static $linkTrailRegex = |
545 | '/^[^\0-`{÷ĀĈ-ČĎĐĒĔĖĚĜĝĠ-ĪĬ-įIJĴ-ĹĻ-ĽĿŀŅņʼnŊŌŎŏŒŔŖ-ŘŜŝŠŤŦŨŪ-ŬŮŲ-ŴŶŸ' . |
546 | 'ſ-ǤǦǨǪ-Ǯǰ-ȗȜ-ȞȠ-ɘɚ-ʑʓ-ʸʽ-̂̄-΅·Ϗ-ЯѐѝѠѢѤѦѨѪѬѮѰѲѴѶѸѺ-ѾҀ-҃҅-ҐҒҔҕҘҚҜ-ҠҤ-ҪҬҭҰҲ' . |
547 | 'Ҵ-ҶҸҹҼ-ҿӁ-ӗӚ-ӜӞӠ-ӢӤӦӪ-ӲӴӶ-ՠֈ--ؠً-ٳٵ-ٽٿ-څڇ-ڗڙ-ڨڪ-ڬڮڰ-ڽڿ-ۅۈ-ۊۍ-۔ۖ--' . |
548 | '---੯ੴ-ჱ-ẼẾ-\x{200b}\x{200d}-‒—-‗‚‛”--\x{fffd}]+$/D'; |
549 | // phpcs:enable Generic.Files.LineLength.TooLong |
550 | |
/**
 * Check whether some text is a valid link trail, i.e. it is non-empty
 * and matches self::$linkTrailRegex.
 *
 * @param string $text
 * @return bool
 */
public static function isLinkTrail( string $text ): bool {
	if ( $text === '' ) {
		return false;
	}
	return (bool)preg_match( self::$linkTrailRegex, $text );
}
560 | |
/**
 * Convert BCP-47-compliant language code to MediaWiki-internal code.
 *
 * This is a temporary back-compatibility hack; Parsoid should be
 * using BCP 47 strings or Bcp47Code objects in all its external APIs.
 * Try to avoid using it, though: there's no guarantee
 * that this mapping will remain in sync with upstream.
 *
 * @param string|Bcp47Code $code BCP-47 language code
 * @return string MediaWiki-internal language code (always lowercase)
 */
public static function bcp47ToMwCode( $code ): string {
	// This map is dumped from
	// LanguageCode::NON_STANDARD_LANGUAGE_CODE_MAPPING in core, but
	// with keys and values swapped and BCP-47 codes lowercased:
	//
	//   array_flip(array_map(strtolower,
	//       LanguageCode::NON_STANDARD_LANGUAGE_CODE_MAPPING))
	//
	// Hopefully we will be able to deprecate and remove this from
	// Parsoid quickly enough that keeping it in sync with upstream
	// is not an issue.
	static $MAP = [
		"cbk" => "cbk-zam",
		"de-x-formal" => "de-formal",
		"egl" => "eml",
		"en-x-rtl" => "en-rtl",
		"es-x-formal" => "es-formal",
		"hu-x-formal" => "hu-formal",
		"jv-x-bms" => "map-bms",
		"ro-cyrl-md" => "mo",
		"nrf" => "nrm",
		"nl-x-informal" => "nl-informal",
		"nap-x-tara" => "roa-tara",
		"en-simple" => "simple",
		"sr-cyrl" => "sr-ec",
		"sr-latn" => "sr-el",
		"zh-hans-cn" => "zh-cn",
		"zh-hans-sg" => "zh-sg",
		"zh-hans-my" => "zh-my",
		"zh-hant-tw" => "zh-tw",
		"zh-hant-hk" => "zh-hk",
		"zh-hant-mo" => "zh-mo",
	];
	$bcp47 = ( $code instanceof Bcp47Code ) ? $code->toBcp47Code() : $code;
	// All MW-internal codes are lowercase
	$lowered = strtolower( $bcp47 );
	if ( isset( $MAP[$lowered] ) ) {
		return $MAP[$lowered];
	}
	return $lowered;
}
611 | |
/**
 * Convert MediaWiki-internal language code to a BCP-47-compliant
 * language code suitable for including in HTML.
 *
 * This is a temporary back-compatibility hack, needed for compatibility
 * when running in standalone mode with MediaWiki Action APIs which expose
 * internal language codes. These APIs should eventually be improved
 * so that they also expose BCP-47 compliant codes, which can then be
 * used directly by Parsoid without conversion. But until that day
 * comes, this function will paper over the differences.
 *
 * Note that MediaWiki-internal Language objects implement Bcp47Code,
 * so we can transition interfaces which currently take a string code
 * to pass a Language object instead; that will make this method
 * effectively a no-op and avoid the issue of upstream sync of the
 * mapping table.
 *
 * @param string|Bcp47Code $code MediaWiki-internal language code or object
 * @param bool $strict If true, this code will log a deprecation message
 *  or fail if a MediaWiki-internal language code is passed.
 * @param ?LoggerInterface $warnLogger A deprecation warning will be
 *  emitted on $warnLogger if $strict is true and a string-valued
 *  MediaWiki-internal language code is passed; otherwise an exception
 *  will be thrown. An exception is always thrown when running tests.
 * @return Bcp47Code BCP-47 language code.
 * @see LanguageCode::bcp47()
 */
public static function mwCodeToBcp47(
	$code, bool $strict = false, ?LoggerInterface $warnLogger = null
): Bcp47Code {
	if ( $code instanceof Bcp47Code ) {
		return $code;
	}
	if ( $strict ) {
		$msg = "Use of string-valued BCP-47 codes is deprecated.";
		$inTests = defined( 'MW_PHPUNIT_TEST' ) || defined( 'MW_PARSER_TEST' );
		// Fail hard when running tests, and also when strict mode was
		// requested without a deprecation logger to report to.
		if ( $inTests || !$warnLogger ) {
			throw new \Error( $msg );
		}
		$warnLogger->warning( $msg );
	}
	// This map is dumped from
	// LanguageCode::getNonstandardLanguageCodeMapping() in core.
	// Hopefully we will be able to deprecate and remove this method
	// from Parsoid quickly enough that keeping it in sync with upstream
	// will not be an issue.
	static $MAP = [
		"als" => "gsw",
		"bat-smg" => "sgs",
		"be-x-old" => "be-tarask",
		"fiu-vro" => "vro",
		"roa-rup" => "rup",
		"zh-classical" => "lzh",
		"zh-min-nan" => "nan",
		"zh-yue" => "yue",
		"cbk-zam" => "cbk",
		"de-formal" => "de-x-formal",
		"eml" => "egl",
		"en-rtl" => "en-x-rtl",
		"es-formal" => "es-x-formal",
		"hu-formal" => "hu-x-formal",
		"map-bms" => "jv-x-bms",
		"mo" => "ro-Cyrl-MD",
		"nrm" => "nrf",
		"nl-informal" => "nl-x-informal",
		"roa-tara" => "nap-x-tara",
		"simple" => "en-simple",
		"sr-ec" => "sr-Cyrl",
		"sr-el" => "sr-Latn",
		"zh-cn" => "zh-Hans-CN",
		"zh-sg" => "zh-Hans-SG",
		"zh-my" => "zh-Hans-MY",
		"zh-tw" => "zh-Hant-TW",
		"zh-hk" => "zh-Hant-HK",
		"zh-mo" => "zh-Hant-MO",
	];
	$code = $MAP[$code] ?? $code;
	// The casing rules below follow LanguageCode::bcp47() in core.
	$segments = explode( '-', $code );
	$normalized = [];
	foreach ( $segments as $position => $segment ) {
		$lowered = strtolower( $segment );
		// The primary language subtag is lowercase, and so is a private
		// segment, i.e. one whose previous segment is "x".
		if ( $position === 0 || strtolower( $segments[$position - 1] ) === 'x' ) {
			$normalized[] = $lowered;
			continue;
		}
		switch ( strlen( $segment ) ) {
			case 2:
				// ISO 3166 country code
				$normalized[] = strtoupper( $segment );
				break;
			case 4:
				// ISO 15924 script code
				$normalized[] = ucfirst( $lowered );
				break;
			default:
				// Use lowercase for other cases
				$normalized[] = $lowered;
		}
	}
	return new Bcp47CodeValue( implode( '-', $normalized ) );
}
715 | |
/**
 * BCP 47 codes are case-insensitive, so this helper compares two
 * Bcp47Code objects by their string form, ignoring case.
 *
 * @param Bcp47Code $a
 * @param Bcp47Code $b
 * @return bool true iff $a and $b represent the same language
 */
public static function isBcp47CodeEqual( Bcp47Code $a, Bcp47Code $b ): bool {
	$left = $a->toBcp47Code();
	$right = $b->toBcp47Code();
	return strcasecmp( $left, $right ) === 0;
}
726 | } |