Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
17.74% |
33 / 186 |
|
21.43% |
6 / 28 |
CRAP | |
0.00% |
0 / 1 |
Utils | |
17.74% |
33 / 186 |
|
21.43% |
6 / 28 |
2718.93 | |
0.00% |
0 / 1 |
stripParsoidIdPrefix | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
stripNamespace | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
isParsoidObjectId | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
isVoidElement | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
recursiveClone | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
clone | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
56 | |||
lastUniChar | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
30 | |||
isUniWord | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
phpURLEncode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
decodeURI | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
decodeURIComponent | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
3 | |||
extractExtBody | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
isValidOffset | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
isValidDSR | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
42 | |||
normalizeNamespaceName | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
decodeWtEntities | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
escapeWtEntities | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
2 | |||
escapeHtml | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
entityEncodeAll | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
1 | |||
isProtocolValid | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
getExtArgInfo | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
6 | |||
parseMediaDimensions | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
42 | |||
validateMediaParam | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
getStar | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
isLinkTrail | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
bcp47ToMwCode | |
0.00% |
0 / 26 |
|
0.00% |
0 / 1 |
6 | |||
mwCodeToBcp47 | |
0.00% |
0 / 51 |
|
0.00% |
0 / 1 |
182 | |||
isBcp47CodeEqual | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Utils; |
5 | |
6 | use Psr\Log\LoggerInterface; |
7 | use Wikimedia\Bcp47Code\Bcp47Code; |
8 | use Wikimedia\Bcp47Code\Bcp47CodeValue; |
9 | use Wikimedia\Parsoid\Config\Env; |
10 | use Wikimedia\Parsoid\Core\DomSourceRange; |
11 | use Wikimedia\Parsoid\Core\Sanitizer; |
12 | use Wikimedia\Parsoid\NodeData\DataMw; |
13 | use Wikimedia\Parsoid\Tokens\Token; |
14 | use Wikimedia\Parsoid\Wikitext\Consts; |
15 | |
16 | /** |
17 | * This file contains general utilities for token transforms. |
18 | */ |
19 | class Utils { |
/**
 * Pattern fragment that matches one wikitext comment (`<!-- ... -->`).
 * It carries no delimiters so that it can be spliced into larger
 * regular expressions.
 *
 * Maintenance note: this fragment is used inside /x regexes, so any
 * whitespace or '#' added to it must be escaped.
 */
public const COMMENT_REGEXP_FRAGMENT = '<!--(?>[\s\S]*?-->)';

/** Complete, delimited regular expression matching a wikitext comment. */
public const COMMENT_REGEXP = '/' . self::COMMENT_REGEXP_FRAGMENT . '/';
28 | |
/**
 * Remove the Parsoid id prefix from an about id.
 *
 * A leading 'mwt', optionally preceded by '#', is dropped; any other
 * string is returned unchanged.
 *
 * @param string $aboutId about ID string
 * @return string
 */
public static function stripParsoidIdPrefix( string $aboutId ): string {
	// 'mwt' is the prefix used for new ids
	$prefixPattern = '/^#?mwt/';
	return preg_replace( $prefixPattern, '', $aboutId );
}
39 | |
/**
 * Reduce a fully qualified PHP class name to its unqualified short name
 * by dropping everything up to and including the last backslash.
 *
 * @param string $className
 * @return string
 */
public static function stripNamespace( string $className ): string {
	// Greedy match: consumes through the final namespace separator.
	$namespacePattern = '/.*\\\\/';
	return preg_replace( $namespacePattern, '', $className );
}
48 | |
/**
 * Tell whether an about id carries the Parsoid id prefix.
 *
 * @param string $aboutId about ID string
 * @return bool true when $aboutId begins with '#mwt'
 */
public static function isParsoidObjectId( string $aboutId ): bool {
	// 'mwt' is the prefix used for new ids
	$parsoidPrefix = '#mwt';
	return str_starts_with( $aboutId, $parsoidPrefix );
}
59 | |
/**
 * Tell whether the named tag is a void element (one that cannot have
 * content), according to the 'VoidTags' table in Consts::$HTML.
 *
 * @param string $name tag name
 * @return bool
 */
public static function isVoidElement( string $name ): bool {
	$voidTags = Consts::$HTML['VoidTags'] ?? [];
	return isset( $voidTags[$name] );
}
69 | |
/**
 * Helper that always requests a deep clone from self::clone().
 *
 * @param object $el object
 * @return object
 */
private static function recursiveClone( $el ) {
	$deepClone = true;
	return self::clone( $el, $deepClone );
}
79 | |
/**
 * Clone a value; deep by default.
 *
 * @param object|array $obj arrays or plain objects.
 *  Tokens or DOM nodes shouldn't be passed in.
 *
 *  CAVEAT: debugging code appears to pass in arrays that can contain
 *  DOM nodes. So, when $debug is set, a top-level DOM node or DOM nodes
 *  embedded in arrays are handled via cloneNode(). This will still fail
 *  miserably if an *object* embeds a DOM node.
 *
 * @param bool $deepClone
 * @param bool $debug
 * @return object|array
 */
public static function clone( $obj, $deepClone = true, $debug = false ) {
	// Debug-only special cases: DOM nodes and arrays (which may hold DOM nodes).
	if ( $debug && $obj instanceof \DOMNode ) {
		return $obj->cloneNode( $deepClone );
	}
	if ( $debug && is_array( $obj ) ) {
		if ( !$deepClone ) {
			// Arrays are values in PHP: copy-on-write gives the shallow copy.
			return $obj;
		}
		$copies = [];
		foreach ( $obj as $key => $member ) {
			$copies[$key] = self::clone( $member, true, true );
		}
		return $copies;
	}

	// Shallow copy of an object.
	if ( is_object( $obj ) && !$deepClone ) {
		return clone $obj;
	}

	// FIXME, see T161647
	// This will fail if $obj is (or embeds) a DOMNode
	$frozen = serialize( $obj );
	return unserialize( $frozen );
}
121 | |
/**
 * Extract the last *unicode* character of the string.
 * This might be more than one byte, if the last character
 * is non-ASCII.
 *
 * The scan walks backwards over UTF-8 continuation bytes (10xxxxxx)
 * until it reaches a lead byte. It never walks past the start of the
 * string, so malformed input that consists only of continuation bytes
 * yields those bytes instead of looping forever.
 *
 * @param string $str
 * @param ?int $idx The index *after* the character to extract; defaults
 *   to the length of $str, which will extract the last character in
 *   $str.
 * @return string The character ending just before $idx, or '' when
 *   $str is empty or $idx is out of range.
 */
public static function lastUniChar( string $str, ?int $idx = null ): string {
	$len = strlen( $str );
	$idx ??= $len;
	// Also covers the empty string with the default index (0 <= 0).
	if ( $idx <= 0 || $idx > $len ) {
		return '';
	}
	$c = $str[--$idx];
	// ord() looks at the first byte of $c, i.e. the byte most recently
	// prepended. Stop at offset 0: a negative offset would wrap around
	// to the end of the string and then run off into uninitialized
	// offsets without ever terminating.
	while ( $idx > 0 && ( ord( $c ) & 0xC0 ) === 0x80 ) {
		$c = $str[--$idx] . $c;
	}
	return $c;
}
144 | |
/**
 * Tell whether $s begins with a word character, matching `\w` in
 * UTF-8 mode.
 *
 * @param string $s
 * @return bool
 */
public static function isUniWord( string $s ): bool {
	$leadingWordChar = '#^\w#u';
	$matched = preg_match( $leadingWordChar, $s );
	return $matched === 1;
}
153 | |
/**
 * Unsupported stub: always throws, directing callers to urlencode().
 *
 * @param string $txt URL to encode using PHP encoding
 * @return string
 */
public static function phpURLEncode( $txt ) {
	// @phan-suppress-previous-line PhanPluginNeverReturnMethod
	$advice = 'Use urlencode( $txt ) instead';
	throw new \BadMethodCallException( $advice );
}
163 | |
/**
 * Percent-decode only valid UTF-8 characters, leaving other encoded bytes alone.
 *
 * Unlike `decodeURIComponent`, escapes of the URI reserved characters
 * (%23 %24 %26 %2B %2C %2F %3A %3B %3D %3F %40) are left encoded,
 * matching the behavior of JavaScript's decodeURI().
 *
 * @see https://www.ecma-international.org/ecma-262/6.0/#sec-decodeuri-encodeduri
 * @param string $s URI to be decoded
 * @return string
 */
public static function decodeURI( string $s ): string {
	// Protect reserved-character escapes by encoding their '%' as '%25';
	// decodeURIComponent then turns that back into a literal '%'.
	$reservedEscape = '/%(?=2[346bcfBCF]|3[abdfABDF]|40)/';
	$shielded = preg_replace( $reservedEscape, '%25', $s );
	return self::decodeURIComponent( $shielded );
}
179 | |
/**
 * Percent-decode only valid UTF-8 characters, leaving other encoded bytes alone.
 *
 * @param string $s URI to be decoded
 * @return string
 */
public static function decodeURIComponent( string $s ): string {
	// Fast path: most input decodes to valid UTF-8 in one go.
	$decoded = rawurldecode( $s );
	if ( !mb_check_encoding( $decoded, 'UTF-8' ) ) {
		// Slow path: pick out each escape run shaped like a 1-4 byte
		// UTF-8 sequence and decode it only if the result is valid;
		// everything else stays percent-encoded.
		// phpcs:ignore Generic.Files.LineLength.TooLong
		$utf8Escape = '/%[0-7][0-9A-F]|%[CD][0-9A-F]%[89AB][0-9A-F]|%E[0-9A-F](?:%[89AB][0-9A-F]){2}|%F[0-4](?:%[89AB][0-9A-F]){3}/i';
		$decoded = preg_replace_callback(
			$utf8Escape,
			static function ( $hit ) {
				$candidate = rawurldecode( $hit[0] );
				if ( mb_check_encoding( $candidate, 'UTF-8' ) ) {
					return $candidate;
				}
				return $hit[0];
			},
			$s
		);
	}
	return $decoded;
}
203 | |
/**
 * Extract the extension body from an extension tag token: the token's
 * 'source' attribute is passed through stripTags() on its extTagOffsets.
 *
 * @param Token $token token
 * @return string
 */
public static function extractExtBody( Token $token ): string {
	$source = $token->getAttributeV( 'source' );
	$extTagOffsets = $token->dataParsoid->extTagOffsets;
	'@phan-var \Wikimedia\Parsoid\Core\DomSourceRange $extTagOffsets';
	$body = $extTagOffsets->stripTags( $source );
	return $body;
}
216 | |
/**
 * Helper: an offset is valid when it is set and not negative.
 *
 * @param ?int $n offset to check
 * @return bool true when $n is a non-null integer >= 0
 */
private static function isValidOffset( ?int $n ): bool {
	if ( $n === null ) {
		return false;
	}
	return $n >= 0;
}
226 | |
/**
 * Basic check if a DOM Source Range (DSR) is valid.
 *
 * Clarifications about the "basic validity checks":
 * - Only checks for underflow, not for overflow.
 * - Does not verify that start <= end
 * - Does not verify that openWidth + endWidth <= end - start
 *   (even so, the values might be invalid because of content)
 * These would be overkill for our purposes. Given how DSR computation
 * works in this codebase, the real scenarios we care about are
 * non-null / non-negative values since that can happen.
 *
 * @param ?DomSourceRange $dsr DSR source range values
 * @param bool $all Also check the widths of the container tag
 * @return bool
 */
public static function isValidDSR(
	?DomSourceRange $dsr, bool $all = false
): bool {
	if ( $dsr === null ) {
		return false;
	}
	if ( !self::isValidOffset( $dsr->start ) || !self::isValidOffset( $dsr->end ) ) {
		return false;
	}
	if ( !$all ) {
		// Tag widths were not requested; start/end are enough.
		return true;
	}
	return self::isValidOffset( $dsr->openWidth ) &&
		self::isValidOffset( $dsr->closeWidth );
}
255 | |
/**
 * Canonicalizes a namespace name: lowercases it and replaces each
 * space with an underscore.
 *
 * @param string $name Non-normalized namespace name.
 * @return string
 */
public static function normalizeNamespaceName( string $name ): string {
	$lowered = mb_strtolower( $name );
	return str_replace( ' ', '_', $lowered );
}
265 | |
/**
 * Decode HTML5 entities in wikitext.
 *
 * NOTE that wikitext only allows semicolon-terminated entities, while
 * HTML allows a number of "legacy" entities to be decoded without
 * a terminating semicolon. This function deliberately does not
 * decode these HTML-only entity forms.
 *
 * @param string $text
 * @return string
 */
public static function decodeWtEntities( string $text ): string {
	// Normalize first, then decode. By normalizing before decoding, this
	// routine deliberately does not decode entity references which are
	// invalid in wikitext (mostly because they decode to invalid
	// codepoints), nor the semicolon-less forms that HTML5 tolerates.
	$normalized = Sanitizer::normalizeCharReferences( $text );
	return Sanitizer::decodeCharReferences( $normalized );
}
288 | |
/**
 * Entity-escape anything that would decode to a valid wikitext entity.
 *
 * Note that HTML5 allows certain "semicolon-less" entities, like
 * `&para`; these aren't allowed in wikitext and won't be escaped
 * by this function.
 *
 * @param string $text
 * @return string $text with the leading '&' of every valid entity
 *   replaced by '&amp;'
 */
public static function escapeWtEntities( string $text ): string {
	// We just want to encode ampersands that precede valid entities.
	// (And note that semicolon-less entities aren't valid wikitext.)
	return preg_replace_callback(
		'/&[#0-9a-zA-Z\x80-\xff]+;/',
		static function ( $match ) {
			$m = $match[0];
			// Only a candidate that actually decodes to something
			// else is a real entity.
			if ( self::decodeWtEntities( $m ) === $m ) {
				// Not an entity, just return the string
				return $m;
			}
			// Escape the ampersand. The replacement must be the entity
			// '&amp;'; a bare '&' would hand back $m unchanged and make
			// this whole function a no-op.
			return '&amp;' . substr( $m, 1 );
		},
		$text
	);
}
314 | |
/**
 * Convert the HTML special characters in $s to entities.
 *
 * @param string $s
 * @return string
 */
public static function escapeHtml( string $s ): string {
	// Only encodes five characters: " ' & < >
	$flags = ENT_QUOTES | ENT_HTML5;
	return htmlspecialchars( $s, $flags );
}
325 | |
/**
 * Encode all characters as entity references. This is done to make
 * characters safe for wikitext (regardless of whether they are
 * HTML-safe). Typically only called with single-codepoint strings.
 *
 * Every code point becomes a hexadecimal numeric reference. Two
 * conventions are then applied to that output: single-digit hex codes
 * are zero-padded to two digits, and U+00A0 is written as `&nbsp;`.
 *
 * @param string $s
 * @return string
 */
public static function entityEncodeAll( string $s ): string {
	// This is Unicode aware.
	// The keys are the *entity strings* produced by
	// mb_encode_numericentity(), not the raw characters: after encoding
	// no raw character is left in the string to match against.
	static $conventions = [
		// We always use at least two characters for the hex code
		'&#x0;' => '&#x00;', '&#x1;' => '&#x01;', '&#x2;' => '&#x02;', '&#x3;' => '&#x03;',
		'&#x4;' => '&#x04;', '&#x5;' => '&#x05;', '&#x6;' => '&#x06;', '&#x7;' => '&#x07;',
		'&#x8;' => '&#x08;', '&#x9;' => '&#x09;', '&#xA;' => '&#x0A;', '&#xB;' => '&#x0B;',
		'&#xC;' => '&#x0C;', '&#xD;' => '&#x0D;', '&#xE;' => '&#x0E;', '&#xF;' => '&#x0F;',
		// By convention we use &nbsp; where possible
		'&#xA0;' => '&nbsp;',
	];

	// Convmap [ start, end, offset, mask ]: cover every code point,
	// shift by nothing, mask nothing; final `true` selects hex output.
	return strtr( mb_encode_numericentity(
		$s, [ 0, 0x10ffff, 0, ~0 ], 'utf-8', true
	), $conventions );
}
349 | |
/**
 * Determine whether the protocol of a link is potentially valid, using
 * the environment's per-wiki site config. Targets that are not strings
 * are always reported as valid.
 *
 * @param mixed $linkTarget
 * @param Env $env
 * @return bool
 */
public static function isProtocolValid( $linkTarget, Env $env ): bool {
	$siteConf = $env->getSiteConfig();
	return is_string( $linkTarget )
		? $siteConf->hasValidProtocol( $linkTarget )
		: true;
}
366 | |
/**
 * Build the DataMw argument information for an extension tag token.
 *
 * The result carries the token's 'name' attribute and its 'options'
 * attribute converted to an object via TokenUtils::kvToHash(). When the
 * tag's closeWidth is not 0, the extension source is added as
 * body->extsrc.
 *
 * @param Token $extToken
 * @return DataMw
 */
public static function getExtArgInfo( Token $extToken ): DataMw {
	$dataMw = new DataMw( [
		'name' => $extToken->getAttributeV( 'name' ),
		'attrs' => (object)TokenUtils::kvToHash(
			$extToken->getAttributeV( 'options' )
		),
	] );

	$isSelfClosing = $extToken->dataParsoid->extTagOffsets->closeWidth === 0;
	if ( !$isSelfClosing ) {
		$dataMw->body = (object)[
			'extsrc' => self::extractExtBody( $extToken ),
		];
	}
	return $dataMw;
}
389 | |
/**
 * Parse a media dimension string such as "100", "100x200", "x200" or
 * "100x200px".
 *
 * @param string $str media dimension string to parse
 * @param bool $onlyOne If set, returns null if multiple dimensions are present
 * @return ?array{x:?int,y:?int,bogusPx:bool} null when $str does not have
 *   the expected form; otherwise 'x' / 'y' hold the parsed numbers (null
 *   when that part is absent or empty()) and 'bogusPx' says whether a
 *   trailing 'px' was present.
 */
public static function parseMediaDimensions(
	string $str, bool $onlyOne = false
): ?array {
	// We support a trailing 'px' here for historical reasons
	// (T15500, T53628, T207032)
	if ( !preg_match( '/^(\d*)(?:x(\d+))?\s*(px\s*)?$/D', $str, $match ) ) {
		return null;
	}

	$hasWidth = !empty( $match[1] );
	$hasHeight = !empty( $match[2] );
	if ( $hasHeight && $onlyOne ) {
		return null;
	}

	return [
		'x' => $hasWidth ? intval( $match[1], 10 ) : null,
		'y' => $hasHeight ? intval( $match[2], 10 ) : null,
		'bogusPx' => !empty( $match[3] ),
	];
}
420 | |
/**
 * Validate a media parameter: it must be set and strictly positive.
 * More generally, this is defined by the media handler in core.
 *
 * @param ?int $num
 * @return bool
 */
public static function validateMediaParam( ?int $num ): bool {
	if ( $num === null ) {
		return false;
	}
	return $num > 0;
}
431 | |
/**
 * FIXME: Is this needed??
 *
 * Intended to extract content in a backwards compatible way, but it is
 * not ported: calling it always throws.
 *
 * @param object $revision
 * @return object
 */
public static function getStar( $revision ) {
	// @phan-suppress-previous-line PhanPluginNeverReturnMethod
	// Unported logic, kept as a reference for a future port:
	//   $content = $revision;
	//   if ( $revision && isset( $revision->slots ) ) {
	//       $content = $revision->slots->main;
	//   }
	//   return $content;
	$reason = "This method shouldn't be needed. " .
		"But, port this if you really need it.";
	throw new \BadMethodCallException( $reason );
}
452 | |
453 | /** |
454 | * This regex was generated by running through *all unicode characters* and |
455 | * testing them against *all regexes* for linktrails in a default MW install. |
456 | * We had to treat it a little bit, here's what we changed: |
457 | * |
458 | * 1. A-Z, though allowed in Walloon, is disallowed. |
459 | * 2. '"', though allowed in Chuvash, is disallowed. |
460 | * 3. '-', though allowed in Icelandic (possibly due to a bug), is disallowed. |
461 | * 4. '1', though allowed in Lak (possibly due to a bug), is disallowed. |
462 | */ |
463 | // phpcs:disable Generic.Files.LineLength.TooLong |
464 | public static $linkTrailRegex = |
465 | '/^[^\0-`{÷ĀĈ-ČĎĐĒĔĖĚĜĝĠ-ĪĬ-įIJĴ-ĹĻ-ĽĿŀŅņʼnŊŌŎŏŒŔŖ-ŘŜŝŠŤŦŨŪ-ŬŮŲ-ŴŶŸ' . |
466 | 'ſ-ǤǦǨǪ-Ǯǰ-ȗȜ-ȞȠ-ɘɚ-ʑʓ-ʸʽ-̂̄-΅·Ϗ-ЯѐѝѠѢѤѦѨѪѬѮѰѲѴѶѸѺ-ѾҀ-҃҅-ҐҒҔҕҘҚҜ-ҠҤ-ҪҬҭҰҲ' . |
467 | 'Ҵ-ҶҸҹҼ-ҿӁ-ӗӚ-ӜӞӠ-ӢӤӦӪ-ӲӴӶ-ՠֈ--ؠً-ٳٵ-ٽٿ-څڇ-ڗڙ-ڨڪ-ڬڮڰ-ڽڿ-ۅۈ-ۊۍ-۔ۖ--' . |
468 | '---੯ੴ-ჱ-ẼẾ-\x{200b}\x{200d}-‒—-‗‚‛”--\x{fffd}]+$/D'; |
469 | // phpcs:enable Generic.Files.LineLength.TooLong |
470 | |
/**
 * Check whether some text is a valid link trail, i.e. it is non-empty
 * and matches self::$linkTrailRegex.
 *
 * @param string $text
 * @return bool
 */
public static function isLinkTrail( string $text ): bool {
	if ( $text === '' ) {
		return false;
	}
	return (bool)preg_match( self::$linkTrailRegex, $text );
}
480 | |
/**
 * Convert BCP-47-compliant language code to MediaWiki-internal code.
 *
 * This is a temporary back-compatibility hack; Parsoid should be
 * using BCP 47 strings or Bcp47Code objects in all its external APIs.
 * Try to avoid using it, though: there's no guarantee
 * that this mapping will remain in sync with upstream.
 *
 * @param string|Bcp47Code $code BCP-47 language code
 * @return string MediaWiki-internal language code
 */
public static function bcp47ToMwCode( $code ): string {
	// This map is dumped from
	// LanguageCode::NON_STANDARD_LANGUAGE_CODE_MAPPING in core, but
	// with keys and values swapped and BCP-47 codes lowercased:
	//
	//   array_flip(array_map(strtolower,
	//       LanguageCode::NON_STANDARD_LANGUAGE_CODE_MAPPING))
	//
	// Hopefully we will be able to deprecate and remove this from
	// Parsoid quickly enough that keeping it in sync with upstream
	// is not an issue.
	static $bcp47ToMw = [
		"cbk" => "cbk-zam",
		"de-x-formal" => "de-formal",
		"egl" => "eml",
		"en-x-rtl" => "en-rtl",
		"es-x-formal" => "es-formal",
		"hu-x-formal" => "hu-formal",
		"jv-x-bms" => "map-bms",
		"ro-cyrl-md" => "mo",
		"nrf" => "nrm",
		"nl-x-informal" => "nl-informal",
		"nap-x-tara" => "roa-tara",
		"en-simple" => "simple",
		"sr-cyrl" => "sr-ec",
		"sr-latn" => "sr-el",
		"zh-hans-cn" => "zh-cn",
		"zh-hans-sg" => "zh-sg",
		"zh-hans-my" => "zh-my",
		"zh-hant-tw" => "zh-tw",
		"zh-hant-hk" => "zh-hk",
		"zh-hant-mo" => "zh-mo",
	];

	$bcp47 = $code instanceof Bcp47Code ? $code->toBcp47Code() : $code;
	// All MW-internal codes are lowercase
	$lowered = strtolower( $bcp47 );
	if ( isset( $bcp47ToMw[$lowered] ) ) {
		return $bcp47ToMw[$lowered];
	}
	return $lowered;
}
531 | |
/**
 * Convert MediaWiki-internal language code to a BCP-47-compliant
 * language code suitable for including in HTML.
 *
 * This is a temporary back-compatibility hack, needed for compatibility
 * when running in standalone mode with MediaWiki Action APIs which expose
 * internal language codes. These APIs should eventually be improved
 * so that they also expose BCP-47 compliant codes, which can then be
 * used directly by Parsoid without conversion. But until that day
 * comes, this function will paper over the differences.
 *
 * Note that MediaWiki-internal Language objects implement Bcp47Code,
 * so we can transition interfaces which currently take a string code
 * to pass a Language object instead; that will make this method
 * effectively a no-op and avoid the issue of upstream sync of the
 * mapping table.
 *
 * @param string|Bcp47Code $code MediaWiki-internal language code or object
 * @param bool $strict If true, this code will log a deprecation message
 *  or fail if a MediaWiki-internal language code is passed.
 * @param ?LoggerInterface $warnLogger A deprecation warning will be
 *  emitted on $warnLogger if $strict is true and a string-valued
 *  MediaWiki-internal language code is passed; otherwise an exception
 *  will be thrown.
 * @return Bcp47Code BCP-47 language code.
 * @see LanguageCode::bcp47()
 */
public static function mwCodeToBcp47(
	$code, bool $strict = false, ?LoggerInterface $warnLogger = null
): Bcp47Code {
	// Already a code object: nothing to convert.
	if ( $code instanceof Bcp47Code ) {
		return $code;
	}

	if ( $strict ) {
		$msg = "Use of string-valued BCP-47 codes is deprecated.";
		// Always throw an error if running tests
		$underTest = defined( 'MW_PHPUNIT_TEST' ) || defined( 'MW_PARSER_TEST' );
		// Outside tests, also throw when strict mode was requested
		// but no deprecation logger was provided.
		if ( $underTest || !$warnLogger ) {
			throw new \Error( $msg );
		}
		$warnLogger->warning( $msg );
	}

	// This map is dumped from
	// LanguageCode::getNonstandardLanguageCodeMapping() in core.
	// Hopefully we will be able to deprecate and remove this method
	// from Parsoid quickly enough that keeping it in sync with upstream
	// will not be an issue.
	static $mwToBcp47 = [
		"als" => "gsw",
		"bat-smg" => "sgs",
		"be-x-old" => "be-tarask",
		"fiu-vro" => "vro",
		"roa-rup" => "rup",
		"zh-classical" => "lzh",
		"zh-min-nan" => "nan",
		"zh-yue" => "yue",
		"cbk-zam" => "cbk",
		"de-formal" => "de-x-formal",
		"eml" => "egl",
		"en-rtl" => "en-x-rtl",
		"es-formal" => "es-x-formal",
		"hu-formal" => "hu-x-formal",
		"map-bms" => "jv-x-bms",
		"mo" => "ro-Cyrl-MD",
		"nrm" => "nrf",
		"nl-informal" => "nl-x-informal",
		"roa-tara" => "nap-x-tara",
		"simple" => "en-simple",
		"sr-ec" => "sr-Cyrl",
		"sr-el" => "sr-Latn",
		"zh-cn" => "zh-Hans-CN",
		"zh-sg" => "zh-Hans-SG",
		"zh-my" => "zh-Hans-MY",
		"zh-tw" => "zh-Hant-TW",
		"zh-hk" => "zh-Hant-HK",
		"zh-mo" => "zh-Hant-MO",
	];
	$code = $mwToBcp47[$code] ?? $code;

	// The casing rules below follow LanguageCode::bcp47() in core.
	$segments = explode( '-', $code );
	$formatted = [];
	foreach ( $segments as $pos => $segment ) {
		$lowered = strtolower( $segment );
		// when previous segment is x, it is a private segment and should be lc
		$isPrivate = $pos > 0 && strtolower( $segments[$pos - 1] ) === 'x';
		if ( $pos === 0 || $isPrivate ) {
			$formatted[$pos] = $lowered;
			continue;
		}
		$width = strlen( $segment );
		if ( $width === 2 ) {
			// ISO 3166 country code
			$formatted[$pos] = strtoupper( $segment );
		} elseif ( $width === 4 ) {
			// ISO 15924 script code
			$formatted[$pos] = ucfirst( $lowered );
		} else {
			// Use lowercase for other cases
			$formatted[$pos] = $lowered;
		}
	}
	return new Bcp47CodeValue( implode( '-', $formatted ) );
}
635 | |
/**
 * BCP 47 codes are case-insensitive, so this helper does a "proper"
 * comparison of Bcp47Code objects: their string forms are compared
 * without regard to case.
 *
 * @param Bcp47Code $a
 * @param Bcp47Code $b
 * @return bool true iff $a and $b represent the same language
 */
public static function isBcp47CodeEqual( Bcp47Code $a, Bcp47Code $b ): bool {
	$left = $a->toBcp47Code();
	$right = $b->toBcp47Code();
	return strcasecmp( $left, $right ) === 0;
}
646 | } |