Code Coverage for /src/src/Utils/Utils.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	13.58% covered (danger)	13.58%	33 / 243	21.43% covered (danger)	21.43%	6 / 28	CRAP	0.00% covered (danger)	0.00%	0 / 1
Utils	13.58% covered (danger)	13.58%	33 / 243	21.43% covered (danger)	21.43%	6 / 28	3903.67	0.00% covered (danger)	0.00%	0 / 1
stripParsoidIdPrefix	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
stripNamespace	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
isParsoidObjectId	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
isVoidElement	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
recursiveClone	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
clone	0.00% covered (danger)	0.00%	0 / 15	0.00% covered (danger)	0.00%	0 / 1	56
lastUniChar	0.00% covered (danger)	0.00%	0 / 8	0.00% covered (danger)	0.00%	0 / 1	30
isUniWord	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
phpURLEncode	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
decodeURI	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
decodeURIComponent	100.00% covered (success)	100.00%	10 / 10	100.00% covered (success)	100.00%	1 / 1	3
extractExtBody	0.00% covered (danger)	0.00%	0 / 3	0.00% covered (danger)	0.00%	0 / 1	2
isValidOffset	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	6
isValidDSR	0.00% covered (danger)	0.00%	0 / 8	0.00% covered (danger)	0.00%	0 / 1	42
normalizeNamespaceName	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
decodeWtEntities	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
escapeWtEntities	100.00% covered (success)	100.00%	7 / 7	100.00% covered (success)	100.00%	1 / 1	2
escapeWt	0.00% covered (danger)	0.00%	0 / 53	0.00% covered (danger)	0.00%	0 / 1	42
escapeHtml	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
entityEncodeAll	100.00% covered (success)	100.00%	10 / 10	100.00% covered (success)	100.00%	1 / 1	1
isProtocolValid	0.00% covered (danger)	0.00%	0 / 4	0.00% covered (danger)	0.00%	0 / 1	6
getExtArgInfo	0.00% covered (danger)	0.00%	0 / 12	0.00% covered (danger)	0.00%	0 / 1	6
parseMediaDimensions	0.00% covered (danger)	0.00%	0 / 18	0.00% covered (danger)	0.00%	0 / 1	90
validateMediaParam	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	6
isLinkTrail	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	6
bcp47ToMwCode	0.00% covered (danger)	0.00%	0 / 26	0.00% covered (danger)	0.00%	0 / 1	6
mwCodeToBcp47	0.00% covered (danger)	0.00%	0 / 51	0.00% covered (danger)	0.00%	0 / 1	182
isBcp47CodeEqual	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2

1	<?php
2	declare( strict_types = 1 );
3
4	namespace Wikimedia\Parsoid\Utils;
5
6	use Psr\Log\LoggerInterface;
7	use Wikimedia\Bcp47Code\Bcp47Code;
8	use Wikimedia\Bcp47Code\Bcp47CodeValue;
9	use Wikimedia\Parsoid\Config\Env;
10	use Wikimedia\Parsoid\Config\SiteConfig;
11	use Wikimedia\Parsoid\Core\DomSourceRange;
12	use Wikimedia\Parsoid\Core\Sanitizer;
13	use Wikimedia\Parsoid\NodeData\DataMw;
14	use Wikimedia\Parsoid\Tokens\Token;
15	use Wikimedia\Parsoid\Wikitext\Consts;
16
17	/**
18	* This file contains general utilities for token transforms.
19	*/
20	class Utils {
/**
 * Regular expression fragment for matching wikitext comments.
 * Meant for inclusion in other regular expressions; it carries no
 * delimiters or modifiers of its own.
 */
// Maintenance note: this is used in /x regexes so all whitespace and # should be escaped
public const COMMENT_REGEXP_FRAGMENT = '<!--(?>[\s\S]*?-->)';
/** Regular expression for matching a wikitext comment (the fragment wrapped in '/' delimiters) */
public const COMMENT_REGEXP = '/' . self::COMMENT_REGEXP_FRAGMENT . '/';
29
/**
 * Remove the Parsoid id prefix from an about id.
 *
 * An optional leading '#' followed by 'mwt' is removed from the start of
 * the string; input that does not start that way is returned unchanged.
 *
 * @param string $aboutId about ID string
 * @return string
 */
public static function stripParsoidIdPrefix( string $aboutId ): string {
	// 'mwt' is the prefix used for new ids
	$prefixPattern = '/^#?mwt/';
	$withoutPrefix = preg_replace( $prefixPattern, '', $aboutId );
	return $withoutPrefix;
}
40
/**
 * Reduce a fully qualified class name to its short (unqualified) name.
 *
 * Everything up to and including the last backslash is dropped.
 *
 * @param string $className
 * @return string
 */
public static function stripNamespace( string $className ): string {
	// Greedy match, so the whole namespace path is consumed in one go
	$namespacePattern = '/.*\\\\/';
	$shortName = preg_replace( $namespacePattern, '', $className );
	return $shortName;
}
49
/**
 * Tell whether an about id carries the Parsoid id prefix ('#mwt').
 *
 * @param string $aboutId about ID string
 * @return bool
 */
public static function isParsoidObjectId( string $aboutId ): bool {
	// 'mwt' is the prefix used for new ids; compare the first four bytes
	return strncmp( $aboutId, '#mwt', 4 ) === 0;
}
60
/**
 * Tell whether the named tag is a void element (cannot have content).
 *
 * The answer comes from the 'VoidTags' table in Consts::$HTML.
 *
 * @param string $name tag name
 * @return bool
 */
public static function isVoidElement( string $name ): bool {
	$listedAsVoid = isset( Consts::$HTML['VoidTags'][$name] );
	return $listedAsVoid;
}
70
/**
 * Helper that deep-clones a value by delegating to self::clone()
 * with deep cloning enabled and debugging disabled.
 *
 * @param object $el object
 * @return object
 */
private static function recursiveClone( $el ) {
	$deepCopy = self::clone( $el, true, false );
	return $deepCopy;
}
80
/**
 * Clone an array or plain object; deep by default.
 *
 * Tokens or DOM nodes shouldn't be passed in.
 *
 * CAVEAT: It looks like debugging methods pass in arrays
 * that can have DOM nodes. So, when $debug is set, top-level DOM nodes
 * and DOM nodes embedded in arrays are handled specially.
 * But, this will miserably fail if an object embeds a DOM node.
 *
 * @param object|array $obj arrays or plain objects
 * @param bool $deepClone
 * @param bool $debug
 * @return object|array
 */
public static function clone( $obj, $deepClone = true, $debug = false ) {
	if ( $debug && $obj instanceof \DOMNode ) {
		return $obj->cloneNode( $deepClone );
	}

	if ( $debug && is_array( $obj ) ) {
		if ( !$deepClone ) {
			return $obj; // Copy-on-write cloning
		}
		// Clone every element in debug mode as well, so nested DOM nodes
		// and nested arrays get the same treatment.
		return array_map(
			static fn ( $item ) => self::clone( $item, true, true ),
			$obj
		);
	}

	if ( is_object( $obj ) && !$deepClone ) {
		return clone $obj;
	}

	// FIXME, see T161647
	// This will fail if $obj is (or embeds) a DOMNode
	$serialized = serialize( $obj );
	return unserialize( $serialized );
}
122
/**
 * Extract the last unicode character of the string.
 * This might be more than one byte, if the last character
 * is non-ASCII.
 *
 * The string is scanned backwards over UTF-8 continuation bytes
 * (10xxxxxx) until a non-continuation byte or the start of the string is
 * reached, so malformed input made only of continuation bytes is
 * returned as-is instead of reading past the start of the string.
 *
 * @param string $str
 * @param ?int $idx The index after the character to extract; defaults
 *   to the length of $str, which will extract the last character in
 *   $str.
 * @return string The character, or '' if $str is empty or $idx is out
 *   of range.
 */
public static function lastUniChar( string $str, ?int $idx = null ): string {
	$len = strlen( $str );
	$idx ??= $len;
	// Also covers the empty string, where the defaulted index is 0
	if ( $idx <= 0 || $idx > $len ) {
		return '';
	}
	$c = $str[--$idx];
	// ord() looks at the first byte of $c, i.e. the byte most recently prepended
	while ( $idx > 0 && ( ord( $c ) & 0xC0 ) === 0x80 ) {
		$c = $str[--$idx] . $c;
	}
	return $c;
}
145
/**
 * Tell whether $s begins with a unicode word character.
 *
 * The check is `\w` anchored at the start of the string, evaluated in
 * PCRE's UTF-8 mode.
 *
 * @param string $s
 * @return bool
 */
public static function isUniWord( string $s ): bool {
	$matchCount = preg_match( '#^\w#u', $s );
	return $matchCount === 1;
}
154
/**
 * This should not be used; it unconditionally throws.
 *
 * @param string $txt URL to encode using PHP encoding
 * @return string
 * @throws \BadMethodCallException always
 */
public static function phpURLEncode( $txt ) {
	// @phan-suppress-previous-line PhanPluginNeverReturnMethod
	$advice = 'Use urlencode( $txt ) instead';
	throw new \BadMethodCallException( $advice );
}
164
/**
 * Percent-decode only valid UTF-8 characters, leaving other encoded bytes alone.
 *
 * Distinct from `decodeURIComponent` in that certain escapes are not decoded,
 * matching the behavior of JavaScript's decodeURI(): the escapes
 * %23 %24 %26 %2B %2C %2F %3A %3B %3D %3F and %40 (in either letter case)
 * are kept in their percent-encoded form.
 *
 * @see https://www.ecma-international.org/ecma-262/6.0/#sec-decodeuri-encodeduri
 * @param string $s URI to be decoded
 * @return string
 */
public static function decodeURI( string $s ): string {
	// Escape the '%' in sequences for the reserved characters, so that the
	// decoding pass below turns "%25XX" back into a literal "%XX".
	$protected = preg_replace( '/%(?=2[346bcfBCF]|3[abdfABDF]|40)/', '%25', $s );
	return self::decodeURIComponent( $protected );
}
180
/**
 * Percent-decode only valid UTF-8 characters, leaving other encoded bytes alone.
 *
 * The whole string is decoded in one step when the result is valid
 * UTF-8. Otherwise each escape run shaped like a 1- to 4-byte UTF-8
 * sequence is decoded on its own, and kept percent-encoded if its
 * decoded bytes are not valid UTF-8.
 *
 * @param string $s URI to be decoded
 * @return string
 */
public static function decodeURIComponent( string $s ): string {
	// Most of the time we should have valid input
	$decoded = rawurldecode( $s );
	if ( mb_check_encoding( $decoded, 'UTF-8' ) ) {
		return $decoded;
	}

	// Extract each encoded character and decode it individually
	return preg_replace_callback(
		// phpcs:ignore Generic.Files.LineLength.TooLong
		'/%[0-7][0-9A-F]|%[CD][0-9A-F]%[89AB][0-9A-F]|%E[0-9A-F](?:%[89AB][0-9A-F]){2}|%F[0-4](?:%[89AB][0-9A-F]){3}/i',
		static function ( array $match ): string {
			$candidate = rawurldecode( $match[0] );
			return mb_check_encoding( $candidate, 'UTF-8' ) ? $candidate : $match[0];
		},
		$s
	);
}
204
/**
 * Extract the extension source (body) from an extension tag token.
 *
 * Reads the token's 'source' attribute and hands it to
 * stripTags() on the token's extTagOffsets.
 *
 * @param Token $token token
 * @return string
 */
public static function extractExtBody( Token $token ): string {
	$fullSource = $token->getAttributeV( 'source' );
	$extTagOffsets = $token->dataParsoid->extTagOffsets;
	'@phan-var \Wikimedia\Parsoid\Core\DomSourceRange $extTagOffsets';
	$body = $extTagOffsets->stripTags( $fullSource );
	return $body;
}
217
/**
 * Helper that checks a single offset value.
 *
 * @param ?int $n value to check
 * @return bool true when $n is non-null and zero or positive
 */
private static function isValidOffset( ?int $n ): bool {
	if ( $n === null ) {
		return false;
	}
	return $n >= 0;
}
227
/**
 * Basic check if a DOM Source Range (DSR) is valid.
 *
 * Clarifications about the "basic validity checks":
 * - Only checks for underflow, not for overflow.
 * - Does not verify that start <= end
 * - Does not verify that openWidth + endWidth <= end - start
 *   (even so, the values might be invalid because of content)
 * These would be overkill for our purposes. Given how DSR computation
 * works in this codebase, the real scenarios we care about are
 * non-null / non-negative values since that can happen.
 *
 * @param ?DomSourceRange $dsr DSR source range values
 * @param bool $all Also check the widths of the container tag
 * @return bool false for a null $dsr or when any checked value is
 *   null or negative
 */
public static function isValidDSR(
	?DomSourceRange $dsr, bool $all = false
): bool {
	if ( $dsr === null ) {
		return false;
	}
	if ( !self::isValidOffset( $dsr->start ) || !self::isValidOffset( $dsr->end ) ) {
		return false;
	}
	if ( !$all ) {
		return true;
	}
	// The tag widths are only inspected when the caller asks for them
	return self::isValidOffset( $dsr->openWidth ) &&
		self::isValidOffset( $dsr->closeWidth );
}
256
/**
 * Canonicalizes a namespace name: lowercases it (multibyte-aware) and
 * turns spaces into underscores.
 *
 * @param string $name Non-normalized namespace name.
 * @return string
 */
public static function normalizeNamespaceName( string $name ): string {
	$lowercased = mb_strtolower( $name );
	return str_replace( ' ', '_', $lowercased );
}
266
/**
 * Decode HTML5 entities in wikitext.
 *
 * NOTE that wikitext only allows semicolon-terminated entities, while
 * HTML allows a number of "legacy" entities to be decoded without
 * a terminating semicolon. This function deliberately does not
 * decode these HTML-only entity forms.
 *
 * @param string $text
 * @return string
 */
public static function decodeWtEntities( string $text ): string {
	// Normalizing first means that entity references which are invalid
	// in wikitext (mostly because they decode to invalid codepoints)
	// are deliberately left undecoded by the second step.
	$normalized = Sanitizer::normalizeCharReferences( $text );
	return Sanitizer::decodeCharReferences( $normalized );
}
289
/**
 * Entity-escape anything that would decode to a valid wikitext entity.
 *
 * Note that HTML5 allows certain "semicolon-less" entities, like
 * `&para`; these aren't allowed in wikitext and won't be escaped
 * by this function.
 *
 * @param string $text
 * @return string $text with the leading '&' of every decodable entity
 *   rewritten as '&amp;'
 */
public static function escapeWtEntities( string $text ): string {
	// We just want to encode ampersands that precede valid entities.
	// (And note that semicolon-less entities aren't valid wikitext.)
	return preg_replace_callback(
		'/&[#0-9a-zA-Z\x80-\xff]+;/',
		static function ( array $match ): string {
			$candidate = $match[0];
			if ( self::decodeWtEntities( $candidate ) === $candidate ) {
				// Not an entity, just return the string
				return $candidate;
			}
			// Escape the ampersand so the entity survives as literal text
			return '&amp;' . substr( $candidate, 1 );
		},
		$text
	);
}
315
/**
 * Ensure that the given literal string is safe to parse as wikitext.
 * See wfEscapeWikiText() in core.
 *
 * Characters and sequences that have wikitext meaning are replaced by
 * numeric character references. The first and last characters get extra
 * protection so the result cannot combine with surrounding text to form
 * a token.
 *
 * @param string $input literal text
 * @return string wikitext-safe text
 */
public static function escapeWt( string $input ): string {
	// Replacement tables are built once and cached for later calls
	static $repl = null, $repl2 = null, $repl3 = null, $repl4 = null;
	if ( $repl === null ) {
		$repl = [
			'"' => '&#34;', '&' => '&#38;', "'" => '&#39;', '<' => '&#60;',
			'=' => '&#61;', '>' => '&#62;', '[' => '&#91;', ']' => '&#93;',
			'{' => '&#123;', '|' => '&#124;', '}' => '&#125;',
			';' => '&#59;', // a token inside language converter brackets
			'!!' => '&#33;!', // a token inside table context
			"\n!" => "\n&#33;", "\r!" => "\r&#33;", // a token inside table context
			"\n#" => "\n&#35;", "\r#" => "\r&#35;",
			"\n*" => "\n&#42;", "\r*" => "\r&#42;",
			"\n:" => "\n&#58;", "\r:" => "\r&#58;",
			"\n " => "\n&#32;", "\r " => "\r&#32;",
			"\n\n" => "\n&#10;", "\r\n" => "&#13;\n",
			"\n\r" => "\n&#13;", "\r\r" => "\r&#13;",
			"\n\t" => "\n&#9;", "\r\t" => "\r&#9;", // "\n\t\n" is treated like "\n\n"
			"\n----" => "\n&#45;---", "\r----" => "\r&#45;---",
			'__' => '_&#95;', '://' => '&#58;//',
			'~~~' => '~~&#126;', // protect from PST, just to be safe(r)
		];

		$magicLinks = [ 'ISBN', 'PMID', 'RFC' ];
		// We have to catch everything "\s" matches in PCRE
		foreach ( $magicLinks as $magic ) {
			$repl["$magic "] = "$magic&#32;";
			$repl["$magic\t"] = "$magic&#9;";
			$repl["$magic\r"] = "$magic&#13;";
			$repl["$magic\n"] = "$magic&#10;";
			$repl["$magic\f"] = "$magic&#12;";
		}
		// Additionally escape the following characters at the beginning of the
		// string, in case they merge to form tokens when spliced into a
		// string.  Tokens like -{ {{ [[ {| etc are already escaped because
		// the second character is escaped above, but the following tokens
		// are handled here: |+ |- __FOO__ ~~~
		$repl3 = [
			'+' => '&#43;', '-' => '&#45;', '_' => '&#95;', '~' => '&#126;',
		];
		// Similarly, protect the following characters at the end of the
		// string, which could form the start of `__FOO__` or `~~~~`
		// A trailing newline could also form the unintended start of a
		// paragraph break if it is glued to a newline in the following
		// context.
		$repl4 = [
			'_' => '&#95;', '~' => '&#126;',
			"\n" => "&#10;", "\r" => "&#13;",
			"\t" => "&#9;", // "\n\t\n" is treated like "\n\n"
		];

		// And handle protocols that don't use "://"
		$urlProtocols = [
			'bitcoin:', 'geo:', 'magnet:', 'mailto:', 'matrix:', 'news:',
			'sip:', 'sips:', 'sms:', 'tel:', 'urn:', 'xmpp:',
		];
		$repl2 = [];
		foreach ( $urlProtocols as $prot ) {
			$repl2[] = preg_quote( substr( $prot, 0, -1 ), '/' );
		}
		$repl2 = '/\b(' . implode( '|', $repl2 ) . '):/i';
	}
	// Tell phan that $repl2, $repl3 and $repl4 will also be non-null here
	'@phan-var string $repl2';
	'@phan-var array<string,string> $repl3';
	'@phan-var array<string,string> $repl4';
	// A newline is prepended so that the start-of-line rules ("\n*",
	// "\n#", ...) also apply to the very first character; it is removed
	// again right after the replacement.
	$text = substr( strtr( "\n$input", $repl ), 1 );
	if ( $text === '' ) {
		return $text;
	}
	$first = strtr( $text[0], $repl3 ); // protect first character
	if ( strlen( $text ) > 1 ) {
		$text = $first . substr( $text, 1, -1 ) .
			strtr( substr( $text, -1 ), $repl4 ); // protect last character
	} else {
		// special case for single-character strings
		$text = strtr( $first, $repl4 ); // protect last character
	}
	$text = preg_replace( $repl2, '$1&#58;', $text );
	return $text;
}
401
/**
 * Convert special characters to HTML entities
 *
 * @param string $s
 * @return string
 */
public static function escapeHtml( string $s ): string {
	// Only encodes five characters: " ' & < >
	// (ENT_QUOTES covers both quote styles; ENT_HTML5 makes ' come out as &apos;)
	return htmlspecialchars( $s, ENT_QUOTES | ENT_HTML5 );
}
412
/**
 * Encode all characters as entity references.  This is done to make
 * characters safe for wikitext (regardless of whether they are
 * HTML-safe). Typically only called with single-codepoint strings.
 *
 * Every code point from 0 to 0x10FFFF is emitted as a hexadecimal
 * numeric character reference, after which a few conventions are
 * applied: single-digit hex codes are padded to two digits and the
 * no-break space is written as `&nbsp;`.
 *
 * @param string $s
 * @return string
 */
public static function entityEncodeAll( string $s ): string {
	// This is Unicode aware.
	static $conventions = [
		// We always use at least two characters for the hex code
		'&#x0;' => '&#x00;', '&#x1;' => '&#x01;', '&#x2;' => '&#x02;', '&#x3;' => '&#x03;',
		'&#x4;' => '&#x04;', '&#x5;' => '&#x05;', '&#x6;' => '&#x06;', '&#x7;' => '&#x07;',
		'&#x8;' => '&#x08;', '&#x9;' => '&#x09;', '&#xA;' => '&#x0A;', '&#xB;' => '&#x0B;',
		'&#xC;' => '&#x0C;', '&#xD;' => '&#x0D;', '&#xE;' => '&#x0E;', '&#xF;' => '&#x0F;',
		// By convention we use &nbsp; where possible
		'&#xA0;' => '&nbsp;',
	];

	// Conversion map: [ first code point, last code point, offset, mask ].
	// The final `true` asks for hexadecimal references.
	$encoded = mb_encode_numericentity(
		$s, [ 0, 0x10ffff, 0, ~0 ], 'utf-8', true
	);
	return strtr( $encoded, $conventions );
}
436
/**
 * Determine whether the protocol of a link is potentially valid, using
 * the environment's per-wiki site config.
 *
 * Only string targets are checked; any other value is reported as valid.
 *
 * @param mixed $linkTarget
 * @param Env $env
 * @return bool
 */
public static function isProtocolValid( $linkTarget, Env $env ): bool {
	$siteConfig = $env->getSiteConfig();
	return is_string( $linkTarget )
		? $siteConfig->hasValidProtocol( $linkTarget )
		: true;
}
453
/**
 * Get argument information for an extension tag token.
 *
 * The result carries the token's 'name' attribute and its 'options'
 * attribute converted to an object via TokenUtils::kvToHash(). When the
 * tag's close width is non-zero, the extension source is attached as
 * `body->extsrc`.
 *
 * @param Token $extToken
 * @return DataMw
 */
public static function getExtArgInfo( Token $extToken ): DataMw {
	$tagName = $extToken->getAttributeV( 'name' );
	$tagOptions = $extToken->getAttributeV( 'options' );
	$dataMw = new DataMw( [
		'name' => $tagName,
		// T367616: 'attrs' should be renamed to 'extAttrs'
		'attrs' => (object)TokenUtils::kvToHash( $tagOptions ),
	] );

	$isSelfClosing = $extToken->dataParsoid->extTagOffsets->closeWidth === 0;
	if ( !$isSelfClosing ) {
		$dataMw->body = (object)[
			'extsrc' => self::extractExtBody( $extToken ),
		];
	}
	return $dataMw;
}
477
/**
 * Parse media dimensions
 *
 * Accepted forms are "<width>", "<width>x<height>" and "x<height>",
 * each optionally followed by whitespace and a trailing "px".
 *
 * @param SiteConfig $siteConfig
 * @param string $str media dimension string to parse
 * @param bool $onlyOne If set, returns null if multiple dimensions are present
 * @param bool $localized Defaults to false; set to true if the $str
 *   has already been matched against `img_width` to localize the `px`
 *   suffix.
 * @return ?array{x:?int,y:?int,bogusPx:bool} null when $str does not parse
 *   (or a height is present while $onlyOne is set); otherwise 'x' / 'y'
 *   stay null for a dimension that is absent from $str.
 */
public static function parseMediaDimensions(
	SiteConfig $siteConfig, string $str, bool $onlyOne = false,
	bool $localized = false
): ?array {
	if ( !$localized ) {
		$getOption = $siteConfig->getMediaPrefixParameterizedAliasMatcher();
		$bits = $getOption( $str );
		$normalizedBit0 = $bits ? mb_strtolower( trim( $bits['k'] ) ) : null;
		if ( $normalizedBit0 === 'img_width' ) {
			$str = $bits['v'];
		}
	}

	// We support a trailing 'px' here for historical reasons
	// (T15500, T53628, T207032)
	// The width is optional (`\d*`) so that height-only values such as
	// "x200" are accepted.
	if ( !preg_match( '/^(\d*)(?:x(\d+))?\s*(px\s*)?$/D', $str, $match ) ) {
		return null;
	}

	$dimensions = [ 'x' => null, 'y' => null, 'bogusPx' => false ];
	if ( !empty( $match[1] ) ) {
		$dimensions['x'] = intval( $match[1], 10 );
	}
	if ( !empty( $match[2] ) ) {
		if ( $onlyOne ) {
			return null;
		}
		$dimensions['y'] = intval( $match[2], 10 );
	}
	if ( !empty( $match[3] ) ) {
		$dimensions['bogusPx'] = true;
	}
	return $dimensions;
}
521
/**
 * Validate media parameters
 * More generally, this is defined by the media handler in core
 *
 * @param ?int $num
 * @return bool true when $num is non-null and strictly positive
 */
public static function validateMediaParam( ?int $num ): bool {
	if ( $num === null ) {
		return false;
	}
	return $num > 0;
}
532
/**
 * Regular expression used by isLinkTrail() to recognise link trail text.
 *
 * The pattern is anchored at both ends and consists of a single negated
 * character class: a string matches only if none of its characters is
 * one of the listed (excluded) characters or ranges.
 *
 * This regex was generated by running through all unicode characters and
 * testing them against all regexes for linktrails in a default MW install.
 * We had to treat it a little bit, here's what we changed:
 *
 * 1. A-Z, though allowed in Walloon, is disallowed.
 * 2. '"', though allowed in Chuvash, is disallowed.
 * 3. '-', though allowed in Icelandic (possibly due to a bug), is disallowed.
 * 4. '1', though allowed in Lak (possibly due to a bug), is disallowed.
 */
// phpcs:disable Generic.Files.LineLength.TooLong
public static $linkTrailRegex =
'/^[^\0-`{÷ĀĈ-ČĎĐĒĔĖĚĜĝĠ-ĪĬ-įĲĴ-ĹĻ-ĽĿŀŅņŉŊŌŎŏŒŔŖ-ŘŜŝŠŤŦŨŪ-ŬŮŲ-ŴŶŸ' .
'ſ-ǤǦǨǪ-Ǯǰ-ȗȜ-ȞȠ-ɘɚ-ʑʓ-ʸʽ-̂̄-΅·΋΍΢Ϗ-ЯѐѝѠѢѤѦѨѪѬѮѰѲѴѶѸѺ-ѾҀ-҃҅-ҐҒҔҕҘҚҜ-ҠҤ-ҪҬҭҰҲ' .
'Ҵ-ҶҸҹҼ-ҿӁ-ӗӚ-ӜӞӠ-ӢӤӦӪ-ӲӴӶ-ՠֈ-׏׫-ؠً-ٳٵ-ٽٿ-څڇ-ڗڙ-ڨڪ-ڬڮڰ-ڽڿ-ۅۈ-ۊۍ-۔ۖ-਀਄਋-਎਑਒' .
'਩਱਴਷਺਻਽੃-੆੉੊੎-੘੝੟-੯ੴ-჏ჱ-ẼẾ-\x{200b}\x{200d}-‒—-‗‚‛”--\x{fffd}]+$/D';
// phpcs:enable Generic.Files.LineLength.TooLong
550
/**
 * Check whether some text is a valid link trail.
 *
 * The empty string is never a link trail; anything else is tested
 * against self::$linkTrailRegex.
 *
 * @param string $text
 * @return bool
 */
public static function isLinkTrail( string $text ): bool {
	if ( $text === '' ) {
		return false;
	}
	return (bool)preg_match( self::$linkTrailRegex, $text );
}
560
/**
 * Convert BCP-47-compliant language code to MediaWiki-internal code.
 *
 * This is a temporary back-compatibility hack; Parsoid should be
 * using BCP 47 strings or Bcp47Code objects in all its external APIs.
 * Try to avoid using it, though: there's no guarantee
 * that this mapping will remain in sync with upstream.
 *
 * @param string|Bcp47Code $code BCP-47 language code
 * @return string MediaWiki-internal language code (always lowercase)
 */
public static function bcp47ToMwCode( $code ): string {
	// This map is dumped from
	// LanguageCode::NON_STANDARD_LANGUAGE_CODE_MAPPING in core, but
	// with keys and values swapped and BCP-47 codes lowercased:
	//
	//   array_flip(array_map(strtolower,
	//       LanguageCode::NON_STANDARD_LANGUAGE_CODE_MAPPING))
	//
	// Hopefully we will be able to deprecate and remove this from
	// Parsoid quickly enough that keeping it in sync with upstream
	// is not an issue.
	static $bcp47ToInternal = [
		"cbk" => "cbk-zam",
		"de-x-formal" => "de-formal",
		"egl" => "eml",
		"en-x-rtl" => "en-rtl",
		"es-x-formal" => "es-formal",
		"hu-x-formal" => "hu-formal",
		"jv-x-bms" => "map-bms",
		"ro-cyrl-md" => "mo",
		"nrf" => "nrm",
		"nl-x-informal" => "nl-informal",
		"nap-x-tara" => "roa-tara",
		"en-simple" => "simple",
		"sr-cyrl" => "sr-ec",
		"sr-latn" => "sr-el",
		"zh-hans-cn" => "zh-cn",
		"zh-hans-sg" => "zh-sg",
		"zh-hans-my" => "zh-my",
		"zh-hant-tw" => "zh-tw",
		"zh-hant-hk" => "zh-hk",
		"zh-hant-mo" => "zh-mo",
	];
	$bcp47 = $code instanceof Bcp47Code ? $code->toBcp47Code() : $code;
	// All MW-internal codes are lowercase
	$lowercased = strtolower( $bcp47 );
	return $bcp47ToInternal[$lowercased] ?? $lowercased;
}
611
/**
 * Convert MediaWiki-internal language code to a BCP-47-compliant
 * language code suitable for including in HTML.
 *
 * This is a temporary back-compatibility hack, needed for compatibility
 * when running in standalone mode with MediaWiki Action APIs which expose
 * internal language codes.  These APIs should eventually be improved
 * so that they also expose BCP-47 compliant codes, which can then be
 * used directly by Parsoid without conversion.  But until that day
 * comes, this function will paper over the differences.
 *
 * Note that MediaWiki-internal Language objects implement Bcp47Code,
 * so we can transition interfaces which currently take a string code
 * to pass a Language object instead; that will make this method
 * effectively a no-op and avoid the issue of upstream sync of the
 * mapping table.
 *
 * @param string|Bcp47Code $code MediaWiki-internal language code or object
 * @param bool $strict If true, this code will log a deprecation message
 *  or fail if a MediaWiki-internal language code is passed.
 * @param ?LoggerInterface $warnLogger A deprecation warning will be
 *  emitted on $warnLogger if $strict is true and a string-valued
 *  MediaWiki-internal language code is passed; otherwise an exception
 *  will be thrown.
 * @return Bcp47Code BCP-47 language code.
 * @throws \Error In strict mode with a string $code, when running under
 *  tests or when no $warnLogger is supplied.
 * @see LanguageCode::bcp47()
 */
public static function mwCodeToBcp47(
	$code, bool $strict = false, ?LoggerInterface $warnLogger = null
): Bcp47Code {
	if ( $code instanceof Bcp47Code ) {
		return $code;
	}
	if ( $strict ) {
		$msg = "Use of string-valued BCP-47 codes is deprecated.";
		if ( defined( 'MW_PHPUNIT_TEST' ) || defined( 'MW_PARSER_TEST' ) ) {
			// Always throw an error if running tests
			throw new \Error( $msg );
		}
		if ( $warnLogger ) {
			$warnLogger->warning( $msg );
		} else {
			// Strict mode requested but no deprecation logger provided
			throw new \Error( $msg );
		}
	}
	// This map is dumped from
	// LanguageCode::getNonstandardLanguageCodeMapping() in core.
	// Hopefully we will be able to deprecate and remove this method
	// from Parsoid quickly enough that keeping it in sync with upstream
	// will not be an issue.
	static $MAP = [
		"als" => "gsw",
		"bat-smg" => "sgs",
		"be-x-old" => "be-tarask",
		"fiu-vro" => "vro",
		"roa-rup" => "rup",
		"zh-classical" => "lzh",
		"zh-min-nan" => "nan",
		"zh-yue" => "yue",
		"cbk-zam" => "cbk",
		"de-formal" => "de-x-formal",
		"eml" => "egl",
		"en-rtl" => "en-x-rtl",
		"es-formal" => "es-x-formal",
		"hu-formal" => "hu-x-formal",
		"map-bms" => "jv-x-bms",
		"mo" => "ro-Cyrl-MD",
		"nrm" => "nrf",
		"nl-informal" => "nl-x-informal",
		"roa-tara" => "nap-x-tara",
		"simple" => "en-simple",
		"sr-ec" => "sr-Cyrl",
		"sr-el" => "sr-Latn",
		"zh-cn" => "zh-Hans-CN",
		"zh-sg" => "zh-Hans-SG",
		"zh-my" => "zh-Hans-MY",
		"zh-tw" => "zh-Hant-TW",
		"zh-hk" => "zh-Hant-HK",
		"zh-mo" => "zh-Hant-MO",
	];
	$code = $MAP[$code] ?? $code;
	// The rest of this code follows LanguageCode::bcp47() in core:
	// normalize the letter case of each '-'-separated segment.
	$codeSegment = explode( '-', $code );
	$codeBCP = [];
	foreach ( $codeSegment as $segNo => $seg ) {
		if ( $segNo > 0 && strtolower( $codeSegment[$segNo - 1] ) === 'x' ) {
			// when previous segment is x, it is a private segment and should be lc
			$codeBCP[$segNo] = strtolower( $seg );
		} elseif ( $segNo > 0 && strlen( $seg ) === 2 ) {
			// ISO 3166 country code
			$codeBCP[$segNo] = strtoupper( $seg );
		} elseif ( $segNo > 0 && strlen( $seg ) === 4 ) {
			// ISO 15924 script code
			$codeBCP[$segNo] = ucfirst( strtolower( $seg ) );
		} else {
			// Use lowercase for other cases
			$codeBCP[$segNo] = strtolower( $seg );
		}
	}
	return new Bcp47CodeValue( implode( '-', $codeBCP ) );
}
715
/**
 * BCP 47 codes are case-insensitive, so this helper does a "proper"
 * (case-insensitive) comparison of the string forms of two Bcp47Code
 * objects.
 *
 * @param Bcp47Code $a
 * @param Bcp47Code $b
 * @return bool true iff $a and $b represent the same language
 */
public static function isBcp47CodeEqual( Bcp47Code $a, Bcp47Code $b ): bool {
	$left = $a->toBcp47Code();
	$right = $b->toBcp47Code();
	return strcasecmp( $left, $right ) === 0;
}
726	}