Code Coverage for /src/src/Core/Sanitizer.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	5.44% covered (danger)	5.44%	24 / 441	3.03% covered (danger)	3.03%	1 / 33	CRAP	0.00% covered (danger)	0.00%	0 / 1
Sanitizer	5.44% covered (danger)	5.44%	24 / 441	3.03% covered (danger)	3.03%	1 / 33	19172.82	0.00% covered (danger)	0.00%	0 / 1
attributesAllowedInternal	0.00% covered (danger)	0.00%	0 / 3	0.00% covered (danger)	0.00%	0 / 1	2
setupAttributesAllowedInternal	0.00% covered (danger)	0.00%	0 / 127	0.00% covered (danger)	0.00%	0 / 1	6
normalizeCharReferences	0.00% covered (danger)	0.00%	0 / 5	0.00% covered (danger)	0.00%	0 / 1	2
normalizeCharReferencesCallback	0.00% covered (danger)	0.00%	0 / 10	0.00% covered (danger)	0.00%	0 / 1	30
normalizeEntity	0.00% covered (danger)	0.00%	0 / 9	0.00% covered (danger)	0.00%	0 / 1	20
decCharReference	0.00% covered (danger)	0.00%	0 / 4	0.00% covered (danger)	0.00%	0 / 1	6
hexCharReference	0.00% covered (danger)	0.00%	0 / 4	0.00% covered (danger)	0.00%	0 / 1	12
validateCodepoint	0.00% covered (danger)	0.00%	0 / 6	0.00% covered (danger)	0.00%	0 / 1	110
codepointToUtf8	0.00% covered (danger)	0.00%	0 / 3	0.00% covered (danger)	0.00%	0 / 1	2
utf8ToCodepoint	0.00% covered (danger)	0.00%	0 / 3	0.00% covered (danger)	0.00%	0 / 1	2
stripIDNs	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
cleanUrl	0.00% covered (danger)	0.00%	0 / 20	0.00% covered (danger)	0.00%	0 / 1	42
decodeEntity	0.00% covered (danger)	0.00%	0 / 4	0.00% covered (danger)	0.00%	0 / 1	6
decodeChar	0.00% covered (danger)	0.00%	0 / 3	0.00% covered (danger)	0.00%	0 / 1	6
decodeCharReferences	100.00% covered (success)	100.00%	16 / 16	100.00% covered (success)	100.00%	1 / 1	5
normalizeCss	0.00% covered (danger)	0.00%	0 / 18	0.00% covered (danger)	0.00%	0 / 1	20
delimiterReplaceCallback	0.00% covered (danger)	0.00%	0 / 45	0.00% covered (danger)	0.00%	0 / 1	182
delimiterReplace	0.00% covered (danger)	0.00%	0 / 7	0.00% covered (danger)	0.00%	0 / 1	2
isParsoidAttr	0.00% covered (danger)	0.00%	0 / 5	0.00% covered (danger)	0.00%	0 / 1	42
isReservedDataAttribute	66.67% covered (warning)	66.67%	2 / 3	0.00% covered (danger)	0.00%	0 / 1	2.15
sanitizeTagAttrs	0.00% covered (danger)	0.00%	0 / 74	0.00% covered (danger)	0.00%	0 / 1	2070
applySanitizedArgs	0.00% covered (danger)	0.00%	0 / 5	0.00% covered (danger)	0.00%	0 / 1	12
checkCss	85.71% covered (warning)	85.71%	6 / 7	0.00% covered (danger)	0.00%	0 / 1	4.05
cssDecodeCallback	0.00% covered (danger)	0.00%	0 / 10	0.00% covered (danger)	0.00%	0 / 1	72
sanitizeTitleURI	0.00% covered (danger)	0.00%	0 / 14	0.00% covered (danger)	0.00%	0 / 1	20
armorFrenchSpaces	0.00% covered (danger)	0.00%	0 / 8	0.00% covered (danger)	0.00%	0 / 1	2
escapeIdForAttribute	0.00% covered (danger)	0.00%	0 / 2	0.00% covered (danger)	0.00%	0 / 1	6
escapeIdForLink	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
escapeIdForExternalInterwiki	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
escapeIdInternalUrl	0.00% covered (danger)	0.00%	0 / 4	0.00% covered (danger)	0.00%	0 / 1	6
escapeIdInternal	0.00% covered (danger)	0.00%	0 / 13	0.00% covered (danger)	0.00%	0 / 1	20
escapeIdReferenceList	0.00% covered (danger)	0.00%	0 / 5	0.00% covered (danger)	0.00%	0 / 1	6
normalizeSectionNameWhiteSpace	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2

1	<?php
2	declare( strict_types = 1 );
3
4	/**
5	* General token sanitizer. Strips out (or encapsulates) unsafe and disallowed
6	* tag types and attributes. Should run last in the third, synchronous
7	* expansion stage.
8	*
9	* FIXME: This code was originally ported from PHP to JS in 2012
10	* and periodically updated before being ported back to PHP. This code should be
11	* (a) resynced with core sanitizer changes (b) updated to use HTML5 spec
12	*/
13
14	namespace Wikimedia\Parsoid\Core;
15
16	use InvalidArgumentException;
17	use Wikimedia\Assert\Assert;
18	use Wikimedia\Parsoid\Config\SiteConfig;
19	use Wikimedia\Parsoid\DOM\Element;
20	use Wikimedia\Parsoid\Tokens\KV;
21	use Wikimedia\Parsoid\Tokens\Token;
22	use Wikimedia\Parsoid\Utils\DOMCompat;
23	use Wikimedia\Parsoid\Utils\DOMUtils;
24	use Wikimedia\Parsoid\Utils\PHPUtils;
25	use Wikimedia\Parsoid\Utils\TokenUtils;
26	use Wikimedia\RemexHtml\HTMLData;
27
28	class Sanitizer {
29	/**
30	* RDFa and microdata properties allow URLs, URIs and/or CURIs.
31	*/
32	private const MICRODATA = [
33	'rel' => true,
34	'rev' => true,
35	'about' => true,
36	'property' => true,
37	'resource' => true,
38	'datatype' => true,
39	'typeof' => true, // RDFa
40	'itemid' => true,
41	'itemprop' => true,
42	'itemref' => true,
43	'itemscope' => true,
44	'itemtype' => true,
45	];
46
47	private const UTF8_REPLACEMENT = "\u{FFFD}";
48
/**
 * Regular expression to match various types of character references in
 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences.
 *
 * The pattern uses the /x modifier, so the line breaks between the
 * alternatives are not significant. Capture groups:
 *  1. named entity, including its terminating semicolon
 *  2. decimal code point
 *  3. hexadecimal code point
 * The last alternative matches a lone ampersand and captures nothing.
 */
private const CHAR_REFS_REGEX =
	'/&([A-Za-z0-9\x80-\xff]+;)
	|&\#([0-9]+);
	|&\#[xX]([0-9A-Fa-f]+);
	|&/x';
58
/**
 * Case-insensitive, extended-syntax (/x) pattern listing CSS constructs
 * treated as insecure: each alternative is one keyword or function-call
 * prefix. Whitespace inside the pattern is ignored because of /x.
 */
private const INSECURE_RE = '! expression
	| accelerator\s*:
	| -o-link\s*:
	| -o-link-source\s*:
	| -o-replace\s*:
	| url\s*\(
	| src\s*\(
	| image\s*\(
	| image-set\s*\(
	| attr\s*\([^)]+[\s,]+url
	!ix';
70
/**
 * Pattern matching evil uris like javascript:
 * WARNING: DO NOT use this in any place that actually requires denying
 * certain URIs for security reasons. There are NUMEROUS[1] ways to bypass
 * pattern-based deny lists; the only way to be secure from javascript:
 * uri based xss vectors is to allow only things that you know are safe
 * and deny everything else.
 * [1]: http://ha.ckers.org/xss.html
 *
 * The scheme name must be preceded by the start of the string, by
 * whitespace, or by a CSS comment terminator with optional trailing
 * whitespace, and must be followed by a non-word character or the end
 * of the string.
 */
private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)(\W|$)!iD';
81	private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/D";
82
83	/**
84	* Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding.
85	*
86	* @since 1.30
87	*/
88	public const ID_PRIMARY = 0;
89
90	/**
91	* Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false
92	* if no fallback is configured.
93	*
94	* @since 1.30
95	*/
96	public const ID_FALLBACK = 1; // public because it is accessed in Headings handler
97
/**
 * Characters that will be ignored in IDNs.
 * https://datatracker.ietf.org/doc/html/rfc8264#section-9.13
 * https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
 * Strip them before further processing so deny lists and such work.
 * Part of Sanitizer::cleanUrl in core.
 *
 * The pattern is a single alternation written in extended (/x) syntax,
 * so the trailing "# ..." notes are regex comments, not PHP comments.
 */
private const IDN_RE_G = "/
	\\s| # general whitespace
	\u{00AD}| # SOFT HYPHEN
	\u{034F}| # COMBINING GRAPHEME JOINER
	\u{061C}| # ARABIC LETTER MARK
	[\u{115F}-\u{1160}]| # HANGUL CHOSEONG FILLER..
	# HANGUL JUNGSEONG FILLER
	[\u{17B4}-\u{17B5}]| # KHMER VOWEL INHERENT AQ..
	# KHMER VOWEL INHERENT AA
	[\u{180B}-\u{180D}]| # MONGOLIAN FREE VARIATION SELECTOR ONE..
	# MONGOLIAN FREE VARIATION SELECTOR THREE
	\u{180E}| # MONGOLIAN VOWEL SEPARATOR
	[\u{200B}-\u{200F}]| # ZERO WIDTH SPACE..
	# RIGHT-TO-LEFT MARK
	[\u{202A}-\u{202E}]| # LEFT-TO-RIGHT EMBEDDING..
	# RIGHT-TO-LEFT OVERRIDE
	[\u{2060}-\u{2064}]| # WORD JOINER..
	# INVISIBLE PLUS
	\u{2065}| # <reserved-2065>
	[\u{2066}-\u{206F}]| # LEFT-TO-RIGHT ISOLATE..
	# NOMINAL DIGIT SHAPES
	\u{3164}| # HANGUL FILLER
	[\u{FE00}-\u{FE0F}]| # VARIATION SELECTOR-1..
	# VARIATION SELECTOR-16
	\u{FEFF}| # ZERO WIDTH NO-BREAK SPACE
	\u{FFA0}| # HALFWIDTH HANGUL FILLER
	[\u{FFF0}-\u{FFF8}]| # <reserved-FFF0>..
	# <reserved-FFF8>
	[\u{1BCA0}-\u{1BCA3}]| # SHORTHAND FORMAT LETTER OVERLAP..
	# SHORTHAND FORMAT UP STEP
	[\u{1D173}-\u{1D17A}]| # MUSICAL SYMBOL BEGIN BEAM..
	# MUSICAL SYMBOL END PHRASE
	\u{E0000}| # <reserved-E0000>
	\u{E0001}| # LANGUAGE TAG
	[\u{E0002}-\u{E001F}]| # <reserved-E0002>..
	# <reserved-E001F>
	[\u{E0020}-\u{E007F}]| # TAG SPACE..
	# CANCEL TAG
	[\u{E0080}-\u{E00FF}]| # <reserved-E0080>..
	# <reserved-E00FF>
	[\u{E0100}-\u{E01EF}]| # VARIATION SELECTOR-17..
	# VARIATION SELECTOR-256
	[\u{E01F0}-\u{E0FFF}]| # <reserved-E01F0>..
	# <reserved-E0FFF>
	/xuD";
149
150	private const GET_ATTRIBS_RE = '/^[:_\p{L}\p{N}][:_\.\-\p{L}\p{N}]*$/uD';
151
152	/**
153	* Character entity aliases accepted by MediaWiki in wikitext.
154	* These are not part of the HTML standard.
155	*/
156	private const MW_ENTITY_ALIASES = [
157	'רלמ;' => 'rlm;',
158	'رلم;' => 'rlm;',
159	];
160
/**
 * Look up the attributes permitted on one element.
 *
 * The per-element attribute list is flipped, so the attribute names
 * become the keys of the result and membership can be tested with isset().
 * An element with no entry yields an empty array.
 *
 * @param string $element
 * @return array<string,int>
 */
public static function attributesAllowedInternal( string $element ): array {
	// PORT-FIXME: this method is private in core, but used by Gallery
	$allowedByElement = self::setupAttributesAllowedInternal();
	if ( !isset( $allowedByElement[$element] ) ) {
		return [];
	}
	return array_flip( $allowedByElement[$element] );
}
173
/**
 * Build (once) and return the table mapping each allowed HTML element
 * name to the list of attribute names permitted on it.
 *
 * The table is memoized in a function-static variable, so the lists are
 * only assembled on the first call.
 *
 * @return array<string,string[]>
 */
private static function setupAttributesAllowedInternal(): array {
	static $allowed;

	if ( $allowed !== null ) {
		return $allowed;
	}

	$common = [
		// HTML
		'id',
		'class',
		'style',
		'lang',
		'dir',
		'title',
		'tabindex',

		// WAI-ARIA
		'aria-describedby',
		'aria-flowto',
		'aria-hidden',
		'aria-label',
		'aria-labelledby',
		'aria-level',
		'aria-owns',
		'role',

		// RDFa
		// These attributes are specified in section 9 of
		// https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		'about',
		'property',
		'resource',
		'datatype',
		'typeof',

		// Microdata. These are specified by
		// https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
		'itemid',
		'itemprop',
		'itemref',
		'itemscope',
		'itemtype',
	];

	// Appends one or more attribute lists to the common set, in order.
	$withCommon = static function ( array ...$extras ) use ( $common ): array {
		return array_merge( $common, ...$extras );
	};

	$block = $withCommon( [ 'align' ] );
	$tablealign = [ 'align', 'valign' ];
	$tablecell = [
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', // deprecated
		'width', // deprecated
		'height', // deprecated
		'bgcolor', // deprecated
	];
	$citable = $withCommon( [ 'cite' ] );
	$edit = $withCommon( [ 'cite', 'datetime' ] );
	$spanning = $withCommon( [ 'span' ] );
	$cell = $withCommon( $tablecell, $tablealign );

	// Numbers refer to sections in HTML 4.01 standard describing the element.
	// See: https://www.w3.org/TR/html4/
	$table = [
		// 7.5.4
		'div' => $block,
		'center' => $common, // deprecated
		'span' => $common,

		// 7.5.5
		'h1' => $block,
		'h2' => $block,
		'h3' => $block,
		'h4' => $block,
		'h5' => $block,
		'h6' => $block,

		// 7.5.6
		// address

		// 8.2.4
		'bdo' => $common,

		// 9.2.1
		'em' => $common,
		'strong' => $common,
		'cite' => $common,
		'dfn' => $common,
		'code' => $common,
		'samp' => $common,
		'kbd' => $common,
		'var' => $common,
		'abbr' => $common,
		// acronym

		// 9.2.2
		'blockquote' => $citable,
		'q' => $citable,

		// 9.2.3
		'sub' => $common,
		'sup' => $common,

		// 9.3.1
		'p' => $block,

		// 9.3.2
		'br' => $withCommon( [ 'clear' ] ),

		// https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
		'wbr' => $common,

		// 9.3.4
		'pre' => $withCommon( [ 'width' ] ),

		// 9.4
		'ins' => $edit,
		'del' => $edit,

		// 10.2
		'ul' => $withCommon( [ 'type' ] ),
		'ol' => $withCommon( [ 'type', 'start', 'reversed' ] ),
		'li' => $withCommon( [ 'type', 'value' ] ),

		// 10.3
		'dl' => $common,
		'dd' => $common,
		'dt' => $common,

		// 11.2.1
		'table' => $withCommon( [
			'summary', 'width', 'border', 'frame',
			'rules', 'cellspacing', 'cellpadding',
			'align', 'bgcolor',
		] ),

		// 11.2.2
		'caption' => $block,

		// 11.2.3
		'thead' => $common,
		'tfoot' => $common,
		'tbody' => $common,

		// 11.2.4
		'colgroup' => $spanning,
		'col' => $spanning,

		// 11.2.5
		'tr' => $withCommon( [ 'bgcolor' ], $tablealign ),

		// 11.2.6
		'td' => $cell,
		'th' => $cell,

		// 12.2
		// NOTE: <a> is not allowed directly, but this list of allowed
		// attributes is used from the Parser object
		'a' => $withCommon( [ 'href', 'rel', 'rev' ] ), // rel/rev esp. for RDFa

		// 13.2
		// Not usually allowed, but may be used for extension-style hooks
		// such as <math> when it is rasterized, or if $wgAllowImageTag is
		// true
		'img' => $withCommon( [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
		// Attributes for A/V tags added in T163583 / T133673
		'audio' => $withCommon( [ 'controls', 'preload', 'width', 'height' ] ),
		'video' => $withCommon( [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
		'source' => $withCommon( [ 'type', 'src' ] ),
		'track' => $withCommon( [ 'type', 'src', 'srclang', 'kind', 'label' ] ),

		// 15.2.1
		'tt' => $common,
		'b' => $common,
		'i' => $common,
		'big' => $common,
		'small' => $common,
		'strike' => $common,
		's' => $common,
		'u' => $common,

		// 15.2.2
		'font' => $withCommon( [ 'size', 'color', 'face' ] ),
		// basefont

		// 15.3
		'hr' => $withCommon( [ 'width' ] ),

		// HTML Ruby annotation text module, simple ruby only.
		// https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
		'ruby' => $common,
		// rbc
		'rb' => $common,
		'rp' => $common,
		'rt' => $common, // rbspan is deliberately not allowed here
		'rtc' => $common,

		// MathML root element, where used for extensions
		// 'title' may not be 100% valid here; it's XHTML
		// https://www.w3.org/TR/REC-MathML/
		'math' => [ 'class', 'style', 'id', 'title' ],

		// HTML 5 section 4.5
		'figure' => $common,
		'figcaption' => $common,

		// HTML 5 section 4.6
		'bdi' => $common,

		// HTML5 elements, defined by:
		// https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
		'data' => $withCommon( [ 'value' ] ),
		'time' => $withCommon( [ 'datetime' ] ),
		'mark' => $common,

		// meta and link are only permitted by removeHTMLtags when Microdata
		// is enabled so we don't bother adding a conditional to hide these
		// Also meta and link are only valid in WikiText as Microdata elements
		// (ie: validateTag rejects tags missing the attributes needed for Microdata)
		// So we don't bother including $common attributes that have no purpose.
		'meta' => [ 'itemprop', 'content' ],
		'link' => [ 'itemprop', 'href', 'title' ],

		// HTML 5 section 4.3.5
		'aside' => $common,
	];

	$allowed = $table;
	return $allowed;
}
407
/**
 * Ensure that any entities and character references are legal
 * for XML and XHTML specifically. Any stray bits will be
 * &-escaped to result in a valid text fragment.
 *
 * a. named char refs can only be &lt; &gt; &amp; &quot;, others are
 *    numericized (this way we're well-formed even without a DTD)
 * b. any numeric char refs must be legal chars, not invalid or forbidden
 * c. use lower cased "&#x", not "&#X"
 * d. fix or reject non-valid attributes
 *
 * Every match of CHAR_REFS_REGEX is handed to
 * normalizeCharReferencesCallback(); groups that did not participate
 * in a match are reported as null (PREG_UNMATCHED_AS_NULL).
 *
 * @param string $text
 * @return string
 * @internal
 */
public static function normalizeCharReferences( string $text ): string {
	$normalizeOne = static function ( array $matches ): string {
		return self::normalizeCharReferencesCallback( $matches );
	};
	return preg_replace_callback(
		self::CHAR_REFS_REGEX, $normalizeOne, $text, -1, $count, PREG_UNMATCHED_AS_NULL
	);
}
430
/**
 * Normalize a single CHAR_REFS_REGEX match.
 *
 * Dispatches on whichever capture group is set (1: named entity,
 * 2: decimal reference, 3: hexadecimal reference). When no group is set,
 * or the numeric handler rejects the reference by returning null, the
 * whole match is passed through htmlspecialchars() instead.
 *
 * @param array $matches
 * @return string
 */
private static function normalizeCharReferencesCallback( array $matches ): string {
	if ( isset( $matches[1] ) ) {
		$normalized = self::normalizeEntity( $matches[1] );
	} elseif ( isset( $matches[2] ) ) {
		$normalized = self::decCharReference( $matches[2] );
	} elseif ( isset( $matches[3] ) ) {
		$normalized = self::hexCharReference( $matches[3] );
	} else {
		$normalized = null;
	}

	return $normalized ?? htmlspecialchars( $matches[0] );
}
450
/**
 * If the named entity is defined in HTML5
 * return the equivalent numeric entity reference (except for the core &lt;
 * &gt; &amp; &quot;). If the entity is a MediaWiki-specific alias, returns
 * the HTML equivalent. Otherwise, returns HTML-escaped text of
 * pseudo-entity source (eg &amp;foo;)
 *
 * @param string $name Semicolon-terminated name
 * @return string
 */
private static function normalizeEntity( string $name ): string {
	if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
		// Non-standard MediaWiki-specific entities
		return '&' . self::MW_ENTITY_ALIASES[$name];
	}

	if ( in_array( $name, [ 'lt;', 'gt;', 'amp;', 'quot;' ], true ) ) {
		// Keep these in word form
		return "&$name";
	}

	if ( isset( HTMLData::$namedEntityTranslations[$name] ) ) {
		// Beware: some entities expand to more than 1 codepoint, so each
		// character of the translation becomes its own numeric reference.
		return preg_replace_callback(
			'/./Ssu',
			static function ( array $m ): string {
				return '&#' . self::utf8ToCodepoint( $m[0] ) . ';';
			},
			HTMLData::$namedEntityTranslations[$name]
		);
	}

	// Not a known entity: escape the ampersand so the text is emitted
	// literally instead of being passed through as a raw reference.
	return "&amp;$name";
}
477
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint Decimal digits of the reference
 * @return null|string The reference rewritten as "&#N;", or null when
 *  the code point fails validateCodepoint()
 */
private static function decCharReference( string $codepoint ): ?string {
	// intval() will (safely) saturate at the maximum signed integer
	// value if $codepoint is too many digits
	$value = intval( $codepoint );
	return self::validateCodepoint( $value ) ? "&#$value;" : null;
}
492
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint Hexadecimal digits of the reference
 * @return null|string The reference rewritten as lower-case "&#x...;",
 *  or null when the value is out of integer range or fails
 *  validateCodepoint()
 */
private static function hexCharReference( string $codepoint ): ?string {
	$value = hexdec( $codepoint );
	// hexdec() might return a float if the string is too long
	if ( !is_int( $value ) ) {
		return null;
	}
	if ( !self::validateCodepoint( $value ) ) {
		return null;
	}
	return sprintf( '&#x%x;', $value );
}
506
/**
 * Returns true if a given Unicode codepoint is a valid character in
 * both HTML5 and XML.
 *
 * Accepted: TAB, LF, U+0020..U+007E, U+00A0..U+D7FF, U+E000..U+FFFD and
 * U+10000..U+10FFFF. Everything else (other C0 controls, DEL and the
 * C1 range, surrogates, U+FFFE/U+FFFF, out-of-range values) is rejected.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( int $codepoint ): bool {
	// U+000C is valid in HTML5 but not allowed in XML.
	// U+000D is valid in XML but not allowed in HTML5.
	// U+007F - U+009F are disallowed in HTML5 (control characters).
	return $codepoint === 0x09
		|| $codepoint === 0x0a
		|| ( $codepoint >= 0x20 && $codepoint <= 0x7e )
		|| ( $codepoint >= 0xa0 && $codepoint <= 0xd7ff )
		|| ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
		|| ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
}
524
/**
 * Encode a single code point as a UTF-8 string.
 *
 * An invariant assertion fails if mb_chr() cannot produce a character
 * for the given value.
 *
 * @param int $cp
 * @return string
 */
private static function codepointToUtf8( int $cp ): string {
	$encoded = mb_chr( $cp, 'UTF-8' );
	Assert::invariant( $encoded !== false, "Getting char failed!" );
	return $encoded;
}
536
/**
 * Returns the code point at the first position of the string.
 *
 * The string is always interpreted as UTF-8, independent of the
 * process-wide mbstring internal encoding, mirroring codepointToUtf8().
 * An invariant assertion fails if mb_ord() cannot decode a code point.
 *
 * @param string $str
 * @return int
 */
private static function utf8ToCodepoint( string $str ): int {
	$ord = mb_ord( $str, 'UTF-8' );
	Assert::invariant( $ord !== false, "Getting code point failed!" );
	return $ord;
}
548
/**
 * Remove every character matched by IDN_RE_G from a host name.
 *
 * @param string $host
 * @return string
 */
private static function stripIDNs( string $host ): string {
	// This code is part of Sanitizer::cleanUrl in core
	$stripped = preg_replace( self::IDN_RE_G, '', $host );
	return $stripped;
}
557
/**
 * Clean up a URL: percent-encode unsafe characters, reject URLs with a
 * protocol the site does not accept, and strip ignorable characters
 * from the host part.
 *
 * @param SiteConfig $siteConfig Consulted via hasValidProtocol()
 * @param string $href
 * @param string $mode When 'wikilink', the percent-encoding step is skipped
 * @return string|null The cleaned URL, or null if its protocol is not valid
 */
public static function cleanUrl( SiteConfig $siteConfig, string $href, string $mode ): ?string {
	if ( $mode !== 'wikilink' ) {
		// Percent-encode brackets, angle brackets, double quotes, pipes,
		// DEL and everything up to and including the space character.
		$href = preg_replace_callback(
			'/([\][<>"\x00-\x20\x7F\|])/',
			static function ( array $matches ): string {
				return urlencode( $matches[0] );
			},
			$href
		);
	}

	// Split into: 1. optional "scheme:" and/or "//", 2. host, 3. the rest.
	// The quantifiers matter: the scheme may be any length and group 3
	// must take the whole remainder, or the URL would be truncated.
	$matched = preg_match( '#^((?:[a-zA-Z][^:/]*:)?(?://)?)([^/]+)(/?.*)#', $href, $bits );
	if ( $matched !== 1 ) {
		// Nothing recognisable as proto/host; return the URL as it stands
		return $href;
	}

	$proto = $bits[1];
	if ( $proto !== '' && !$siteConfig->hasValidProtocol( $proto ) ) {
		// invalid proto, disallow URL
		return null;
	}

	$host = self::stripIDNs( $bits[2] );
	if ( preg_match( '/^%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$/D', $host, $ipv6 ) ) {
		// IPv6 host names: restore the square brackets that were
		// percent-encoded above, keeping an optional port suffix
		$host = '[' . $ipv6[1] . ']' . $ipv6[2];
	}

	return $proto . $host . $bits[3];
}
594
/**
 * If the named entity is defined in HTML5
 * return the UTF-8 encoding of that character. Otherwise, returns
 * pseudo-entity source (eg "&foo;")
 *
 * MediaWiki-specific aliases are mapped to their standard name before
 * the lookup.
 *
 * @param string $name Semicolon-terminated entity name
 * @return string
 */
private static function decodeEntity( string $name ): string {
	// These are MediaWiki-specific entities, not in the HTML standard
	$name = self::MW_ENTITY_ALIASES[$name] ?? $name;
	return HTMLData::$namedEntityTranslations[$name] ?? "&$name";
}
611
/**
 * Return UTF-8 string for a codepoint if that is a valid
 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 *
 * @param int $codepoint
 * @return string
 */
private static function decodeChar( int $codepoint ): string {
	return self::validateCodepoint( $codepoint )
		? self::codepointToUtf8( $codepoint )
		: self::UTF8_REPLACEMENT;
}
625
/**
 * Decode any character references, numeric or named entities,
 * in the text and return a UTF-8 string.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( string $text ): string {
	$decodeOne = static function ( array $matches ): string {
		if ( isset( $matches[1] ) ) {
			// Named entity
			return self::decodeEntity( $matches[1] );
		}
		if ( isset( $matches[2] ) ) {
			// Decimal reference
			return self::decodeChar( intval( $matches[2] ) );
		}
		if ( !isset( $matches[3] ) ) {
			// Last case should be an ampersand by itself
			return $matches[0];
		}
		// Hexadecimal reference; hexdec() might return a float if the
		// string is too long, which makes the reference invalid.
		$point = hexdec( $matches[3] );
		return is_int( $point ) ? self::decodeChar( $point ) : self::UTF8_REPLACEMENT;
	};

	return preg_replace_callback(
		self::CHAR_REFS_REGEX, $decodeOne, $text, -1, $count, PREG_UNMATCHED_AS_NULL
	);
}
655
/**
 * Normalize CSS into a format we can easily search for hostile input
 *  - decode character references
 *  - decode escape sequences
 *  - convert characters that IE6 interprets into ascii
 *  - remove comments, unless the entire value is one single comment
 *
 * @param string $value the css string
 * @return string normalized css
 */
public static function normalizeCss( string $value ): string {
	// Decode character references like &#123;
	$value = self::decodeCharReferences( $value );

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references.
	// This means it isn't possible for this function to return
	// unsanitized escape sequences. It is possible to manufacture
	// input that contains character references that decode to
	// escape sequences that decode to character references, but
	// it's OK for the return value to contain character references
	// because the caller is supposed to escape those anyway.
	static $decodeRegex;
	if ( !$decodeRegex ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$decodeRegex = "/ $backslash
			(?:
				($nl) | # 1. Line continuation
				([0-9A-Fa-f]{1,6})$space? | # 2. character number
				(.) | # 3. backslash cancelling special meaning
				() | # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback( $decodeRegex,
		[ self::class, 'cssDecodeCallback' ], $value );

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !xD', $value ) ) {
		// Remove any comments; IE gets token splitting wrong
		// This must be done AFTER decoding character references and
		// escape sequences, because those steps can introduce comments
		// This step cannot introduce character references or escape
		// sequences, because it replaces comments with spaces rather
		// than removing them completely.
		$value = self::delimiterReplace( '/*', '*/', ' ', $value );

		// Remove anything after a comment-start token, to guard against
		// incorrect client implementations.
		$commentPos = strpos( $value, '/*' );
		if ( $commentPos !== false ) {
			$value = substr( $value, 0, $commentPos );
		}
	}

	return $value;
}
716
717	// PORT_FIXME - The delimiterReplace code below is from StringUtils in core
718
/**
 * Perform an operation equivalent to `preg_replace_callback()`
 *
 * Matches this code:
 *
 *     preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject );
 *
 * If the start delimiter ends with an initial substring of the end delimiter,
 * e.g. in the case of C-style comments, the behavior differs from the model
 * regex. In this implementation, the end must share no characters with the
 * start, so e.g. `/*\/` is not considered to be both the start and end of a
 * comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`.
 *
 * The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace()
 * but uses far less memory. The delimiters are literal strings, not regular expressions.
 *
 * The callback receives a two-element array: [0] the whole matched span
 * including both delimiters, [1] the text between the delimiters. Its
 * return value replaces the span. An unmatched start or end delimiter is
 * copied to the output unchanged.
 *
 * @param string $startDelim Start delimiter
 * @param string $endDelim End delimiter
 * @param callable $callback Function to call on each match
 * @param string $subject
 * @param string $flags Regular expression flags
 * @throws InvalidArgumentException
 * @return string
 */
private static function delimiterReplaceCallback(
	string $startDelim, string $endDelim, callable $callback, string $subject, string $flags = ''
): string {
	$inputPos = 0;
	$outputPos = 0;
	$contentPos = 0;
	$output = '';
	$foundStart = false;
	$encStart = preg_quote( $startDelim, '!' );
	$encEnd = preg_quote( $endDelim, '!' );
	$strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
	$endLength = strlen( $endDelim );
	// Loop invariants: the subject never changes, so neither does its
	// length nor the tokenizing regex (group 1 = start, group 2 = end).
	$subjectLength = strlen( $subject );
	$tokenRegex = "!($encStart)|($encEnd)!S$flags";
	$m = [];
	while ( $inputPos < $subjectLength &&
		preg_match( $tokenRegex, $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
	) {
		$tokenOffset = $m[0][1];
		if ( $m[1][0] !== '' ) {
			if ( $foundStart &&
				$strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) === 0
			) {
				# An end match is present at the same location
				$tokenType = 'end';
				$tokenLength = $endLength;
			} else {
				$tokenType = 'start';
				$tokenLength = strlen( $m[0][0] );
			}
		} elseif ( $m[2][0] !== '' ) {
			$tokenType = 'end';
			$tokenLength = strlen( $m[0][0] );
		} else {
			// Both groups empty: a delimiter was the empty string
			throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
		}
		if ( $tokenType === 'start' ) {
			# Only move the start position if we haven't already found a start
			# This means that START START END matches outer pair
			if ( !$foundStart ) {
				# Found start
				$inputPos = $tokenOffset + $tokenLength;
				# Write out the non-matching section
				$output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
				$outputPos = $tokenOffset;
				$contentPos = $inputPos;
				$foundStart = true;
			} else {
				# Move the input position past the first character of START,
				# to protect against missing END when it overlaps with START
				$inputPos = $tokenOffset + 1;
			}
		} elseif ( $tokenType === 'end' ) {
			if ( $foundStart ) {
				# Found match
				$output .= $callback( [
					substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
					substr( $subject, $contentPos, $tokenOffset - $contentPos )
				] );
				$foundStart = false;
			} else {
				# Non-matching end, write it out
				$output .= substr( $subject, $inputPos, $tokenOffset + $tokenLength - $outputPos );
			}
			$inputPos = $outputPos = $tokenOffset + $tokenLength;
		} else {
			throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
		}
	}
	# Flush whatever follows the last complete token (including the text
	# after an unterminated start delimiter)
	if ( $outputPos < $subjectLength ) {
		$output .= substr( $subject, $outputPos );
	}
	return $output;
}
815
/**
 * Perform an operation equivalent to `preg_replace()` with flags.
 *
 * Matches this code:
 *
 *     preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject );
 *
 * This is a thin wrapper that forwards to delimiterReplaceCallback()
 * with a callback substituting into the replacement template.
 *
 * @param string $startDelim Start delimiter regular expression
 * @param string $endDelim End delimiter regular expression
 * @param string $replace Replacement string. May contain $1, which will be
 *  replaced by the text between the delimiters
 * @param string $subject String to search
 * @param string $flags Regular expression flags
 * @return string The string with the matches replaced
 */
private static function delimiterReplace(
	string $startDelim, string $endDelim, string $replace, string $subject, string $flags = ''
): string {
	$substitute = static function ( array $matches ) use ( $replace ) {
		$placeholders = [
			'$0' => $matches[0],
			'$1' => $matches[1],
		];
		return strtr( $replace, $placeholders );
	};

	return self::delimiterReplaceCallback( $startDelim, $endDelim, $substitute, $subject, $flags );
}
842
/**
 * Decide whether an attribute looks like one inserted by Parsoid itself.
 *
 * SSS FIXME: There is a test in mediawiki.environment.js that doles out
 * and tests about ids. There are probably some tests in Util.php as well.
 * We should move all these kind of tests somewhere else.
 *
 * @param string $k Attribute name
 * @param string $v Attribute value
 * @param KV[] $attrs All attributes of the tag; only consulted for the
 *  'property' attribute when $k is 'content'
 * @return bool
 */
private static function isParsoidAttr( string $k, string $v, array $attrs ): bool {
	// NOTES:
	// 1. Currently the tokenizer unconditionally escapes typeof and about
	// attributes from wikitxt to data-x-typeof and data-x-about. So,
	// this check will only pass through Parsoid inserted attrs.
	// 2. But, if we fix the over-aggressive escaping in the tokenizer to
	// not escape non-Parsoid typeof and about, then this will return
	// true for something like typeof='mw:Foo evilScriptHere'. But, that
	// is safe since this check is only used to see if we should
	// unconditionally discard the entire attribute or process it further.
	// That further processing will catch and discard any dangerous
	// strings in the rest of the attribute

	// A whitespace-delimited token starting with "mw:"
	$mwToken = '/(?:^|\s)mw:.+?(?=$|\s)/D';

	if ( in_array( $k, [ 'typeof', 'property', 'rel' ], true ) ) {
		return (bool)preg_match( $mwToken, $v );
	}
	if ( $k === 'about' ) {
		return (bool)preg_match( '/^#mwt\d+$/D', $v );
	}
	if ( $k === 'content' ) {
		// 'content' counts when the tag's 'property' attribute has an mw: token
		return (bool)preg_match( $mwToken, KV::lookup( $attrs, 'property' ) ?? '' );
	}
	return false;
}
870
/**
 * Given an attribute name, checks whether it is a reserved data attribute
 * (such as data-mw-foo) which is unavailable to user-generated HTML so MediaWiki
 * core and extension code can safely use it to communicate with frontend code.
 *
 * In this implementation, names starting with lower-case "data-mw" or
 * "data-parsoid" are reported as NOT reserved; any other case-insensitive
 * match of data-ooui / data-mw / data-parsoid is reserved.
 *
 * @param string $attr Attribute name.
 * @return bool
 */
public static function isReservedDataAttribute( string $attr ): bool {
	// data-ooui is reserved for ooui.
	// data-mw and data-parsoid are reserved for parsoid.
	// data-mw-<name here> is reserved for extensions (or core) if
	// they need to communicate some data to the client and want to be
	// sure that it isn't coming from an untrusted user.
	// We ignore the possibility of namespaces since user-generated HTML
	// can't use them anymore.
	if ( preg_match( '/^data-(mw|parsoid)/', $attr ) ) {
		return false; // PARSOID SPECIFIC
	}
	return (bool)preg_match( '/^data-(ooui|mw|parsoid)/i', $attr );
}
891
/**
 * Sanitize the attributes of a tag.
 *
 * The result is keyed by lowercased attribute name. Each entry is a
 * three-element list: [ sanitized value or null, original value,
 * original key ]. A null first element marks an attribute that was
 * rejected. When the same name occurs more than once, later entries
 * overwrite earlier ones.
 *
 * Note that the attribute objects in $attrs are modified in place: their
 * key and value are converted to strings.
 *
 * @param SiteConfig $siteConfig
 * @param ?string $tagName Tag name; when empty, the name is taken from $token
 * @param ?Token $token
 * @param array $attrs
 * @return array
 */
public static function sanitizeTagAttrs(
	SiteConfig $siteConfig, ?string $tagName, ?Token $token, array $attrs
): array {
	$tag = $tagName ?: $token->getName();
	$allowed = self::attributesAllowedInternal( $tag );

	// Attributes whose value is a list of HTML id references
	$idRefListAttrs = [ 'aria-describedby', 'aria-flowto', 'aria-labelledby', 'aria-owns' ];
	// RDFa and microdata properties allow URLs, URIs and/or CURIs.
	$uriLikeAttrs = [
		'rel', 'rev',
		# RDFa
		'about', 'property', 'resource', 'datatype', 'typeof',
		# HTML5 microdata
		'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',
	];
	$urlAttrs = [ 'href', 'src', 'poster' ];

	$newAttrs = [];
	for ( $idx = 0, $total = count( $attrs ); $idx < $total; $idx++ ) {
		$attr = $attrs[$idx];
		$attr->v ??= '';

		// Convert attributes to string, if necessary.
		$attr->k = TokenUtils::tokensToString( $attr->k );

		if ( is_array( $attr->v ) ) {
			// Use the expanded attr instead of trying to unpackDOMFragments
			// since the fragment will have been released when expanding to DOM
			$expanded = $token ? $token->fetchExpandedAttrValue( $attr->k ) : null;
			if ( $expanded !== null ) {
				// See the comment in TokenUtils::tokensToString about
				// unpackDOMFragments for why we're just using the textContent
				$doc = DOMUtils::parseHTML( $expanded );
				$attr->v = DOMCompat::getBody( $doc )->textContent;
			} else {
				$attr->v = TokenUtils::tokensToString( $attr->v );
			}
		}

		$origK = $attr->ksrc ?? $attr->k;
		// $attr->k can be uppercase
		$k = mb_strtolower( $attr->k );
		$v = $attr->v;
		$origV = $attr->vsrc ?? $v;
		// Entry recorded for an attribute that is dropped
		$rejected = [ null, $origV, $origK ];
		$psdAttr = self::isParsoidAttr( $k, $v, $attrs );

		// Bypass RDFa/allowed attribute checks for Parsoid-inserted attrs
		// Safe to do since the tokenizer renames about/typeof attrs.
		// unconditionally. FIXME: The escaping solution in the tokenizer
		// may be aggressive. There is no need to escape typeof strings
		// that or about ids that don't resemble Parsoid tokens/about ids.
		if ( !$psdAttr ) {
			if ( !preg_match( self::GET_ATTRIBS_RE, $k ) ) {
				$newAttrs[$k] = $rejected;
				continue;
			}

			# Allow XML namespace declaration to allow RDFa
			if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $k ) ) {
				$newAttrs[$k] = preg_match( self::EVIL_URI_PATTERN, $v )
					? $rejected
					: [ $v, $origV, $origK ];
				continue;
			}

			# Allow any attribute beginning with "data-"
			# However:
			# * Disallow data attributes used by MediaWiki code
			# * Ensure that the attribute is not namespaced by banning
			#   colons.
			$isPlainDataAttr = preg_match( '/^data-[^:]*$/iD', $k );
			if ( !( $isPlainDataAttr || isset( $allowed[$k] ) )
				|| self::isReservedDataAttribute( $k )
			) {
				$newAttrs[$k] = $rejected;
				continue;
			}
		}

		# Strip javascript "expression" from stylesheets.
		# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
		if ( $k === 'style' ) {
			$v = self::checkCss( $v );
		}

		# Escape HTML id attributes
		if ( $k === 'id' ) {
			$v = self::escapeIdForAttribute( $v, self::ID_PRIMARY );
			if ( $v === '' ) {
				$newAttrs[$k] = $rejected;
				continue;
			}
		}

		# Escape HTML id reference lists
		if ( in_array( $k, $idRefListAttrs, true ) ) {
			$v = self::escapeIdReferenceList( $v );
		}

		// Check URI-like properties for validity.
		// Paranoia. Allow "simple" values but suppress javascript
		if ( in_array( $k, $uriLikeAttrs, true )
			&& preg_match( self::EVIL_URI_PATTERN, $v )
		) {
			// Retain the Parsoid typeofs for Parsoid attrs
			$newV = $psdAttr
				? trim( preg_replace( '/(?:^|\s)(?!mw:\w)\S*/', '', $origV ) )
				: null;
			$newAttrs[$k] = [ $newV, $origV, $origK ];
			continue;
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		# validation code that can be used by tag hook handlers, etc
		if ( $token && in_array( $k, $urlAttrs, true ) ) { // T163583
			// `origV` will always be `v`, because `a.vsrc` isn't set, since
			// this attribute didn't come from source. However, in the
			// LinkHandler, we may have already shadowed this value so use
			// that instead.
			$rel = $token->getAttributeShadowInfo( 'rel' );
			$isWikiLink = $k === 'href'
				&& isset( $rel['value'] )
				&& preg_match( '#^mw:WikiLink(/Interwiki)?$#', $rel['value'] );
			$mode = $isWikiLink ? 'wikilink' : 'external';
			$origHref = $token->getAttributeShadowInfo( $k )['value'];
			$newHref = self::cleanUrl( $siteConfig, $v, $mode );
			if ( $newHref !== $v ) {
				$newAttrs[$k] = [ $newHref, $origHref, $origK ];
				continue;
			}
		}

		if ( $k === 'tabindex' && $v !== '0' ) {
			// Only allow tabindex of 0, which is useful for accessibility.
			continue;
		}

		// SSS FIXME: This logic is not RT-friendly.
		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$newAttrs[$k] = [ $v, $origV, $origK ];
	}

	# itemtype, itemid, itemref don't make sense without itemscope
	if ( !array_key_exists( 'itemscope', $newAttrs ) ) {
		// SSS FIXME: This logic is not RT-friendly.
		unset( $newAttrs['itemtype'], $newAttrs['itemid'], $newAttrs['itemref'] );
	}
	# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

	return $newAttrs;
}
1056
/**
 * Sanitize a list of attributes and set the surviving ones on an element.
 *
 * Used primarily when we're applying tokenized attributes directly to
 * dom elements, which wouldn't have had a chance to be sanitized before
 * tree building. Attributes whose sanitized value is null are not set.
 *
 * @param SiteConfig $siteConfig
 * @param Element $wrapper Element that receives the attributes
 * @param array $attrs Attributes to sanitize and apply
 */
public static function applySanitizedArgs(
	SiteConfig $siteConfig, Element $wrapper, array $attrs
): void {
	$sanitized = self::sanitizeTagAttrs(
		$siteConfig, DOMCompat::nodeName( $wrapper ), null, $attrs
	);
	foreach ( $sanitized as $name => $entry ) {
		// $entry[0] is the sanitized value; null means the attribute was rejected
		$value = $entry[0] ?? null;
		if ( $value === null ) {
			continue;
		}
		$wrapper->setAttribute( $name, $value );
	}
}
1078
/**
 * Normalize a CSS string and replace it with a placeholder comment if it
 * contains control characters, the UTF-8 replacement character, or
 * anything matching the insecure-input pattern.
 *
 * @param string $text
 * @return string The normalized CSS, or a CSS comment naming the reason
 *   the input was rejected
 */
public static function checkCss( string $text ): string {
	$text = self::normalizeCss( $text );

	// \000-\010\013\016-\037\177 are the octal escape sequences
	$hasBadChar = preg_match( '/[\000-\010\013\016-\037\177]/', $text )
		|| strpos( $text, self::UTF8_REPLACEMENT ) !== false;
	if ( $hasBadChar ) {
		return '/* invalid control char */';
	}

	if ( preg_match( self::INSECURE_RE, $text ) ) {
		return '/* insecure input */';
	}

	return $text;
}
1096
/**
 * Regex callback that decodes one CSS backslash escape.
 *
 * Reads capture groups 1 to 3 of the match: group 1 is a line
 * continuation, group 2 a hexadecimal character number, group 3 a single
 * escaped character. When all three are empty the escape is treated as a
 * lone backslash.
 *
 * @param array $matches
 * @return string
 */
public static function cssDecodeCallback( array $matches ): string {
	if ( $matches[1] !== '' ) {
		// Line continuation
		return '';
	}

	if ( $matches[2] !== '' ) {
		# hexdec could return a float if the match is too long, but the
		# regexp in question limits the string length to 6.
		$char = self::codepointToUtf8( hexdec( $matches[2] ) );
	} else {
		$char = $matches[3] !== '' ? $matches[3] : '\\';
	}

	switch ( $char ) {
		case "\n":
		case '"':
		case "'":
		case '\\':
			// These characters need to be escaped in strings
			// Clean up the escape sequence to avoid parsing errors by clients
			return '\\' . dechex( ord( $char ) ) . ' ';

		default:
			// Decode unnecessary escape
			return $char;
	}
}
1123
/**
 * Sanitize a title to be used in a URI?
 *
 * The title is split at the first '#'. In the part before it, each of the
 * characters % ? space [ ] # | < > is passed through
 * PHPUtils::encodeURIComponent(). The part after it, if any, is escaped as
 * a fragment and re-attached with '#'.
 *
 * @param string $title
 * @param bool $isInterwiki Whether to escape the fragment for an external
 *   interwiki rather than for a local link
 * @return string
 */
public static function sanitizeTitleURI( string $title, bool $isInterwiki = false ): string {
	// split at first '#'
	$pieces = explode( '#', $title, 2 );
	$anchor = $pieces[1] ?? null;

	$encoded = preg_replace_callback(
		'/[%? \[\]#|<>]/',
		static function ( $matches ) {
			return PHPUtils::encodeURIComponent( $matches[0] );
		},
		$pieces[0]
	);

	if ( $anchor === null ) {
		return $encoded;
	}

	if ( $isInterwiki ) {
		$fragment = self::escapeIdForExternalInterwiki( $anchor );
	} else {
		$fragment = self::escapeIdForLink( $anchor );
	}
	return $encoded . '#' . $fragment;
}
1148
/**
 * Regex => replacement template pairs used by armorFrenchSpaces().
 * The "%s" in each template is filled in with the replacement space.
 */
public const FIXTAGS = [
	# French spaces, last one Guillemet-left
	# only if it isn't followed by a word character.
	'/ (?=[?:;!%»›](?!\w))/u' => "%s",
	# French spaces, Guillemet-right
	'/([«‹]) /u' => "\\1%s",
];

/**
 * Armor French spaces with a replacement character
 *
 * Every pattern in FIXTAGS is applied to the text, with the given space
 * substituted into the corresponding replacement template.
 *
 * @since 1.32
 * @param string $text Text to armor
 * @param string $space Space character for the French spaces, defaults to ' '
 * @return string Armored text
 */
public static function armorFrenchSpaces( string $text, string $space = ' ' ): string {
	// Replace $ with \$ and \ with \\ so that the space is taken literally
	// when it ends up inside a preg_replace() replacement string
	$literalSpace = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );

	$patterns = [];
	$replacements = [];
	foreach ( self::FIXTAGS as $pattern => $template ) {
		$patterns[] = $pattern;
		// @phan-suppress-next-line PhanPluginPrintfVariableFormatString
		$replacements[] = sprintf( $template, $literalSpace );
	}

	return preg_replace( $patterns, $replacements, $text );
}
1177
/**
 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 * a valid HTML id attribute.
 *
 * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe,
 * be sure to use proper escaping.
 *
 * In Parsoid, proper escaping is usually handled for us by the HTML
 * serialization algorithm, but be careful of corner cases (such as
 * emitting attributes in wikitext).
 *
 * @param string $id String to escape
 * @param int $mode One of ID_* constants, specifying whether the primary or fallback encoding
 *   should be used. ID_FALLBACK selects the 'legacy' encoding; any other
 *   value selects 'html5'.
 * @return string Escaped ID
 *
 * @since 1.30
 */
public static function escapeIdForAttribute( string $id, int $mode = self::ID_PRIMARY ): string {
	// Callers only choose between "primary" and "fallback"; the concrete
	// encoding behind each is decided here. This (slightly) abstracts the
	// actual details of the id encoding from the Parsoid code which handles
	// ids; we could swap primary and fallback here, or even transition to a
	// new HTML6 encoding (!), without touching all the call sites.
	if ( $mode === self::ID_FALLBACK ) {
		$internalMode = 'legacy';
	} else {
		$internalMode = 'html5';
	}

	return self::escapeIdInternal( $id, $internalMode );
}
1205
/**
 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 * a valid URL fragment.
 *
 * Always uses the 'html5' fragment mode.
 *
 * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe,
 * be sure to use proper escaping.
 *
 * @param string $id String to escape
 * @return string Escaped ID
 *
 * @since 1.30
 */
public static function escapeIdForLink( string $id ): string {
	$fragment = self::escapeIdInternalUrl( $id, 'html5' );

	return $fragment;
}
1221
/**
 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 * a valid URL fragment for external interwikis.
 *
 * Always uses the 'legacy' fragment mode.
 *
 * @param string $id String to escape
 * @return string Escaped ID
 *
 * @since 1.30
 */
private static function escapeIdForExternalInterwiki( string $id ): string {
	// Assume $wgExternalInterwikiFragmentMode = 'legacy'
	$fragment = self::escapeIdInternalUrl( $id, 'legacy' );

	return $fragment;
}
1235
/**
 * Escape an id for use in an href. In 'html5' mode this additionally
 * percent-encodes the percent sign of anything that looks like a
 * percent escape (a '%' followed by two hex digits); this is done for
 * href (but not id) attributes.
 *
 * @see https://phabricator.wikimedia.org/T238385
 * @param string $id String to escape
 * @param string $mode One of modes from $wgFragmentMode
 * @return string
 */
private static function escapeIdInternalUrl( string $id, string $mode ): string {
	$escaped = self::escapeIdInternal( $id, $mode );

	if ( $mode !== 'html5' ) {
		return $escaped;
	}

	return preg_replace( '/%([a-fA-F0-9]{2})/', '%25$1', $escaped );
}
1251
/**
 * Helper for escapeIdFor*() functions. Performs most of the actual escaping.
 *
 * - 'html5': tab, LF, FF, CR and space are each replaced with '_'.
 * - 'legacy': spaces become '_', the result is urlencode()d, then '%3A'
 *   is turned back into ':' and every remaining '%' into '.'.
 *
 * @param string $id String to escape
 * @param string $mode One of modes from $wgFragmentMode ('html5' or 'legacy')
 * @return string
 * @throws InvalidArgumentException If $mode is neither 'html5' nor 'legacy'
 */
private static function escapeIdInternal( string $id, string $mode ): string {
	switch ( $mode ) {
		case 'html5':
			// html5 spec says ids must not have any of the following:
			// U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE
			// In practice, in wikitext, only tab, LF, CR (and SPACE) are
			// possible using either Lua or html entities.
			return str_replace( [ "\t", "\n", "\f", "\r", " " ], '_', $id );

		case 'legacy':
			// This corresponds to 'noninitial' mode of the old escapeId.
			// strtr() tries the longest key first, so '%3A' is restored to
			// ':' before the bare '%' rule can touch it.
			$encoded = urlencode( str_replace( ' ', '_', $id ) );
			return strtr( $encoded, [
				'%3A' => ':',
				'%' => '.',
			] );

		default:
			// The method name is wrapped in a matched pair of quotes
			throw new InvalidArgumentException(
				"Invalid mode '$mode' passed to '" . __METHOD__ . "'"
			);
	}
}
1286
/**
 * Given a string containing a whitespace delimited list of ids, escape each
 * id to match ids escaped by the escapeIdForAttribute() function.
 *
 * Runs of whitespace between ids are collapsed to a single space and
 * leading/trailing whitespace is dropped.
 *
 * @since 1.27
 *
 * @param string $referenceString Space delimited list of ids
 * @return string Space delimited list of escaped ids; '' when the input
 *   contains no ids
 */
public static function escapeIdReferenceList( string $referenceString ): string {
	# Break the list into its individual ids, ignoring empty pieces
	$ids = preg_split( '/\s+/', $referenceString, -1, PREG_SPLIT_NO_EMPTY );

	# Escape each id on its own
	$escapedIds = array_map(
		static function ( $ref ) {
			return self::escapeIdForAttribute( $ref );
		},
		$ids
	);

	# Join the ids back into a space delimited list
	# If there were no ids, the result will be an empty string ('')
	return implode( ' ', $escapedIds );
}
1311
/**
 * Normalizes whitespace in a section name, such as might be returned
 * by Parser::stripSectionName(), for use in the ids that are used for
 * section links.
 *
 * Each run of spaces and/or underscores becomes a single space, and
 * whitespace at either end of the result is trimmed.
 *
 * @param string $section
 * @return string
 */
public static function normalizeSectionNameWhiteSpace( string $section ): string {
	$collapsed = preg_replace( '/[ _]+/', ' ', $section );

	return trim( $collapsed );
}
1323	}