Code Coverage for /workspace/src/includes/parser/Sanitizer.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	63.28% covered (warning)	63.28%	398 / 629	64.58% covered (warning)	64.58%	31 / 48	CRAP	0.00% covered (danger)	0.00%	0 / 1
Sanitizer	63.38% covered (warning)	63.38%	398 / 628	64.58% covered (warning)	64.58%	31 / 48	1943.80	0.00% covered (danger)	0.00%	0 / 1
getAttribsRegex	18.18% covered (danger)	18.18%	2 / 11	0.00% covered (danger)	0.00%	0 / 1	4.19
getAttribNameRegex	40.00% covered (danger)	40.00%	2 / 5	0.00% covered (danger)	0.00%	0 / 1	2.86
getRecognizedTagData	40.00% covered (danger)	40.00%	24 / 60	0.00% covered (danger)	0.00%	0 / 1	21.82
internalRemoveHtmlTags	96.43% covered (success)	96.43%	27 / 28	0.00% covered (danger)	0.00%	0 / 1	12
removeSomeTags	100.00% covered (success)	100.00%	28 / 28	100.00% covered (success)	100.00%	1 / 1	1
removeHTMLcomments	11.76% covered (danger)	11.76%	2 / 17	0.00% covered (danger)	0.00%	0 / 1	51.96
validateTag	77.78% covered (warning)	77.78%	7 / 9	0.00% covered (danger)	0.00%	0 / 1	8.70
validateTagAttributes	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
validateAttributes	91.30% covered (success)	91.30%	42 / 46	0.00% covered (danger)	0.00%	0 / 1	36.85
isReservedDataAttribute	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
mergeAttributes	100.00% covered (success)	100.00%	8 / 8	100.00% covered (success)	100.00%	1 / 1	6
normalizeCss	55.56% covered (warning)	55.56%	10 / 18	0.00% covered (danger)	0.00%	0 / 1	5.40
checkCss	100.00% covered (success)	100.00%	9 / 9	100.00% covered (success)	100.00%	1 / 1	4
cssDecodeCallback	80.00% covered (warning)	80.00%	8 / 10	0.00% covered (danger)	0.00%	0 / 1	8.51
fixTagAttributes	85.71% covered (warning)	85.71%	6 / 7	0.00% covered (danger)	0.00%	0 / 1	3.03
encodeAttribute	100.00% covered (success)	100.00%	7 / 7	100.00% covered (success)	100.00%	1 / 1	1
armorFrenchSpaces	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	1
safeEncodeAttribute	100.00% covered (success)	100.00%	24 / 24	100.00% covered (success)	100.00%	1 / 1	1
escapeIdForAttribute	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	3
escapeIdForLink	100.00% covered (success)	100.00%	5 / 5	100.00% covered (success)	100.00%	1 / 1	2
escapeIdForExternalInterwiki	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
escapeIdInternalUrl	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	2
escapeIdInternal	100.00% covered (success)	100.00%	14 / 14	100.00% covered (success)	100.00%	1 / 1	4
escapeIdReferenceListInternal	100.00% covered (success)	100.00%	5 / 5	100.00% covered (success)	100.00%	1 / 1	2
escapeClass	0.00% covered (danger)	0.00%	0 / 4	0.00% covered (danger)	0.00%	0 / 1	2
escapeHtmlAllowEntities	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
decodeTagAttributes	100.00% covered (success)	100.00%	19 / 19	100.00% covered (success)	100.00%	1 / 1	5
safeEncodeTagAttributes	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	3
getTagAttributeCallback	88.89% covered (warning)	88.89%	8 / 9	0.00% covered (danger)	0.00%	0 / 1	5.03
normalizeWhitespace	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	1
normalizeSectionNameWhitespace	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
normalizeCharReferences	100.00% covered (success)	100.00%	5 / 5	100.00% covered (success)	100.00%	1 / 1	1
normalizeCharReferencesCallback	100.00% covered (success)	100.00%	10 / 10	100.00% covered (success)	100.00%	1 / 1	5
normalizeEntity	100.00% covered (success)	100.00%	9 / 9	100.00% covered (success)	100.00%	1 / 1	4
decCharReference	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	2
hexCharReference	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	3
validateCodepoint	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	10
decodeCharReferences	100.00% covered (success)	100.00%	5 / 5	100.00% covered (success)	100.00%	1 / 1	1
decodeCharReferencesAndNormalize	100.00% covered (success)	100.00%	8 / 8	100.00% covered (success)	100.00%	1 / 1	2
decodeCharReferencesCallback	90.00% covered (success)	90.00%	9 / 10	0.00% covered (danger)	0.00%	0 / 1	5.03
decodeChar	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	2
decodeEntity	75.00% covered (warning)	75.00%	3 / 4	0.00% covered (danger)	0.00%	0 / 1	2.06
attributesAllowedInternal	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
setupAttributesAllowedInternal	2.26% covered (danger)	2.26%	3 / 133	0.00% covered (danger)	0.00%	0 / 1	5.74
stripAllTags	100.00% covered (success)	100.00%	10 / 10	100.00% covered (success)	100.00%	1 / 1	1
hackDocType	0.00% covered (danger)	0.00%	0 / 11	0.00% covered (danger)	0.00%	0 / 1	20
cleanUrl	100.00% covered (success)	100.00%	14 / 14	100.00% covered (success)	100.00%	1 / 1	4
validateEmail	91.67% covered (success)	91.67%	11 / 12	0.00% covered (danger)	0.00%	0 / 1	2.00

1	<?php
2	/**
3	* HTML sanitizer for %MediaWiki.
4	*
5	* Copyright © 2002-2005 Brooke Vibber <bvibber@wikimedia.org> et al
6	* https://www.mediawiki.org/
7	*
8	* This program is free software; you can redistribute it and/or modify
9	* it under the terms of the GNU General Public License as published by
10	* the Free Software Foundation; either version 2 of the License, or
11	* (at your option) any later version.
12	*
13	* This program is distributed in the hope that it will be useful,
14	* but WITHOUT ANY WARRANTY; without even the implied warranty of
15	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16	* GNU General Public License for more details.
17	*
18	* You should have received a copy of the GNU General Public License along
19	* with this program; if not, write to the Free Software Foundation, Inc.,
20	* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21	* http://www.gnu.org/copyleft/gpl.html
22	*
23	* @file
24	* @ingroup Parser
25	*/
26
27	namespace MediaWiki\Parser;
28
29	use InvalidArgumentException;
30	use LogicException;
31	use MediaWiki\HookContainer\HookRunner;
32	use MediaWiki\MediaWikiServices;
33	use MediaWiki\Tidy\RemexCompatFormatter;
34	use StringUtils;
35	use UnexpectedValueException;
36	use Wikimedia\RemexHtml\HTMLData;
37	use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer;
38	use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer;
39	use Wikimedia\RemexHtml\TreeBuilder\Dispatcher as RemexDispatcher;
40	use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder as RemexTreeBuilder;
41
42	/**
43	* HTML sanitizer for MediaWiki
44	* @ingroup Parser
45	*/
46	class Sanitizer {
47	/**
48	* Regular expression to match various types of character references in
49	* Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences.
50	* Note that HTML5 allows some named entities to omit the trailing
51	* semicolon; wikitext entities must have a trailing semicolon.
52	*/
53	private const CHAR_REFS_REGEX =
54	'/&([A-Za-z0-9\x80-\xff]+;)
55	\|&\#([0-9]+);
56	\|&\#[xX]([0-9A-Fa-f]+);
57	\|&/x';
58
59	/**
60	* Splits the text after a '<' into (closing slash, tag name, attribute text, '>' or '/>', trailing text).
61	* Tag name charset from the HTML5 parsing spec: https://www.w3.org/TR/html5/syntax.html#tag-open-state
62	*/
63	private const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!'; // name tail is possessive (*+) so one-letter tags match; attributes are lazy (*?) so a trailing '/' lands in the brace group
64
65	/**
66	* Pattern matching evil uris like javascript:
67	* WARNING: DO NOT use this in any place that actually requires denying
68	* certain URIs for security reasons. There are NUMEROUS[1] ways to bypass
69	* pattern-based deny lists; the only way to be secure from javascript:
70	* uri based xss vectors is to allow only things that you know are safe
71	* and deny everything else.
72	* [1]: http://ha.ckers.org/xss.html
73	*/
74	private const EVIL_URI_PATTERN = '!(^\|\s\|\/\s)(javascript\|vbscript)([^\w]\|$)!i';
75	private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
76
77	/**
78	* Tells the escapeIdFor*() methods (e.g. escapeIdForAttribute()) to encode the ID using the wiki's primary encoding.
79	*
80	* @since 1.30
81	*/
82	public const ID_PRIMARY = 0;
83
84	/**
85	* Tells the escapeIdFor*() methods (e.g. escapeIdForAttribute()) to encode the ID using the fallback encoding, or return false
86	* if no fallback is configured.
87	*
88	* @since 1.30
89	*/
90	public const ID_FALLBACK = 1;
91
92	/**
93	* Character entity aliases accepted by MediaWiki in wikitext.
94	* These are not part of the HTML standard.
95	*/
96	private const MW_ENTITY_ALIASES = [
97	'רלמ;' => 'rlm;',
98	'رلم;' => 'rlm;',
99	];
100
101	/**
102	* Lazy-initialised attributes regex, see getAttribsRegex()
103	*/
104	private static ?string $attribsRegex = null;
105
106	/**
107	* Regular expression to match HTML/XML attribute pairs within a tag.
108	* Based on https://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
109	* Used in Sanitizer::decodeTagAttributes
110	*/
111	private static function getAttribsRegex(): string {
112	if ( self::$attribsRegex === null ) { // build the pattern once and cache it in the static property
113	$spaceChars = '\x09\x0a\x0c\x0d\x20'; // tab, LF, FF, CR, space
114	$space = "[{$spaceChars}]";
115	$attrib = "[^{$spaceChars}\/>=]"; // any character that is not whitespace, '/', '>' or '='
116	$attribFirst = "(?:{$attrib}\|=)"; // the first character of a name may additionally be '='
117	self::$attribsRegex = // group 1: name, group 2: optional '=value' part, groups 3/4/5: double-quoted / single-quoted / bare value
118	"/({$attribFirst}{$attrib}*)
119	($space*=$space*
120	(?:
121	# The attribute value: quoted or alone
122	\"([^\"]*)(?:\"\|\$)
123	\| '([^']*)(?:'\|\$)
124	\| (((?!$space\|>).)*)
125	)
126	)?/sxu";
127	}
128	return self::$attribsRegex;
129	}
130
131	/**
132	* Lazy-initialised attribute name regex, see getAttribNameRegex()
133	*/
134	private static ?string $attribNameRegex = null;
135
136	/**
137	* Used in Sanitizer::decodeTagAttributes to filter attributes.
138	*/
139	private static function getAttribNameRegex(): string {
140	if ( self::$attribNameRegex === null ) { // build the pattern on first use and cache it in the static property
141	$attribFirst = "[:_\p{L}\p{N}]"; // first character: colon, underscore, or any Unicode letter/number
142	$attrib = "[:_\.\-\p{L}\p{N}]"; // later characters may additionally be '.' or '-'
143	self::$attribNameRegex = "/^({$attribFirst}{$attrib}*)$/sxu"; // anchored: the whole string must be a valid name
144	}
145	return self::$attribNameRegex;
146	}
147
148	/**
149	* Return the various lists of recognized tags
150	* @param string[] $extratags For any extra tags to include
151	* @param string[] $removetags For any tags (default or extra) to exclude
152	* @return array
153	* @internal
154	*/
155	public static function getRecognizedTagData( array $extratags = [], array $removetags = [] ): array {
156	static $commonCase, $staticInitialised = false; // per-request memo of the default result and of the base tables
157	$isCommonCase = ( $extratags === [] && $removetags === [] ); // no customisation requested
158	if ( $staticInitialised && $isCommonCase && $commonCase ) { // fast path: reuse the result built by an earlier call
159	return $commonCase;
160	}
161
162	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
163	$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic;
164
165	if ( !$staticInitialised ) { // first call: build the base tables below exactly once
166	$htmlpairsStatic = [ # Tags that must be closed
167	'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
168	'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
169	'strike', 'strong', 'tt', 'var', 'div', 'center',
170	'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
171	'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
172	'kbd', 'samp', 'data', 'time', 'mark'
173	];
174	# These tags can be self-closed. For tags not also on
175	# $htmlsingleonly, a self-closed tag will be emitted as
176	# an empty element (open-tag/close-tag pair).
177	$htmlsingle = [
178	'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
179	];
180
181	# Elements that cannot have close tags. This is (not coincidentally)
182	# also the list of tags for which the HTML 5 parsing algorithm
183	# requires you to "acknowledge the token's self-closing flag", i.e.
184	# a self-closing tag like <br/> is not an HTML 5 parse error only
185	# for this list.
186	$htmlsingleonly = [
187	'br', 'wbr', 'hr', 'meta', 'link'
188	];
189
190	$htmlnest = [ # Tags that can be nested--??
191	'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
192	'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
193	'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
194	];
195	$tabletags = [ # Can only appear inside table, we will close them
196	'td', 'th', 'tr',
197	];
198	$htmllist = [ # Tags used by list
199	'ul', 'ol',
200	];
201	$listtags = [ # Tags that can appear in a list
202	'li',
203	];
204
205	$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
206	$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) ); // every tag recognised by default
207
208	# Convert them all to hashtables for faster lookup
209	$vars = [ 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
210	'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' ];
211	foreach ( $vars as $var ) {
212	$$var = array_fill_keys( $$var, true ); // variable-variable: turns each list into a [ tagName => true ] map in place
213	}
214	$staticInitialised = true;
215	}
216
217	# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
218	$extratags = array_fill_keys( $extratags, true ); // same [ tagName => true ] shape as the static tables
219	$removetags = array_fill_keys( $removetags, true );
220	$htmlpairs = array_merge( $extratags, $htmlpairsStatic ); // note: $removetags is not applied to this list
221	$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags ); // defaults plus extras, minus removed tags
222
223	$result = [
224	'htmlpairs' => $htmlpairs,
225	'htmlsingle' => $htmlsingle,
226	'htmlsingleonly' => $htmlsingleonly,
227	'htmlnest' => $htmlnest,
228	'tabletags' => $tabletags,
229	'htmllist' => $htmllist,
230	'listtags' => $listtags,
231	'htmlsingleallowed' => $htmlsingleallowed,
232	'htmlelements' => $htmlelements,
233	];
234	if ( $isCommonCase ) { // only the uncustomised result is memoised
235	$commonCase = $result;
236	}
237	return $result;
238	}
239
240	/**
241	* Cleans up HTML, removes dangerous tags and attributes, and
242	* removes HTML comments; BEWARE there may be unmatched HTML
243	* tags in the result.
244	*
245	* @note Callers are recommended to use `::removeSomeTags()` instead
246	* of this method. `Sanitizer::removeSomeTags()` is safer and will
247	* always return well-formed HTML; however, it is significantly
248	* slower (especially for short strings where setup costs
249	* predominate). This method is for internal use by the legacy parser
250	* where we know the result will be cleaned up in a subsequent tidy pass.
251	*
252	* @param string $text Original string; see T268353 for why untainted.
253	* @param-taint $text none
254	* @param callable\|null $processCallback Callback to do any variable or
255	* parameter replacements in HTML attribute values.
256	* This argument should be considered @internal.
257	* @param-taint $processCallback exec_shell
258	* @param array\|bool $args Arguments for the processing callback
259	* @param-taint $args none
260	* @param array $extratags For any extra tags to include
261	* @param-taint $extratags tainted
262	* @param array $removetags For any tags (default or extra) to exclude
263	* @param-taint $removetags none
264	* @return string
265	* @return-taint escaped
266	* @internal
267	*/
268	public static function internalRemoveHtmlTags( string $text, ?callable $processCallback = null,
269	$args = [], array $extratags = [], array $removetags = []
270	): string {
271	$tagData = self::getRecognizedTagData( $extratags, $removetags );
272	$htmlsingle = $tagData['htmlsingle'];
273	$htmlsingleonly = $tagData['htmlsingleonly'];
274	$htmlelements = $tagData['htmlelements'];
275
276	# Remove HTML comments
277	$text = self::removeHTMLcomments( $text );
278	$bits = explode( '<', $text ); // every element after the first starts right after a '<'
279	$text = str_replace( '>', '&gt;', array_shift( $bits ) ); // text before the first '<': escape any stray '>'
280
281	# this might be possible using remex tidy itself
282	foreach ( $bits as $x ) {
283	if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
284	[ /* $qbar */, $slash, $t, $params, $brace, $rest ] = $regs;
285
286	$badtag = false;
287	$t = strtolower( $t ); // tag names are matched case-insensitively
288	if ( isset( $htmlelements[$t] ) ) {
289	if ( is_callable( $processCallback ) ) {
290	call_user_func_array( $processCallback, [ &$params, $args ] ); // callback may rewrite $params by reference
291	}
292
293	if ( $brace === '/>' && !( isset( $htmlsingle[$t] ) \|\| isset( $htmlsingleonly[$t] ) ) ) {
294	// Remove the self-closing slash, to be consistent
295	// with HTML5 semantics. T134423
296	$brace = '>';
297	}
298	if ( !self::validateTag( $params, $t ) ) {
299	$badtag = true;
300	}
301
302	$newparams = self::fixTagAttributes( $params, $t );
303	if ( !$badtag ) {
304	if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) {
305	# Interpret self-closing tags as empty tags even when
306	# HTML 5 would interpret them as start tags. Such input
307	# is commonly seen on Wikimedia wikis with this intention.
308	$brace = "></$t>";
309	}
310
311	$rest = str_replace( '>', '&gt;', $rest ); // escape any '>' in the text that follows the tag
312	$text .= "<$slash$t$newparams$brace$rest";
313	continue;
314	}
315	}
316	}
317	$text .= '&lt;' . str_replace( '>', '&gt;', $x ); // unrecognised or rejected tag: emit it as escaped literal text
318	}
319	return $text;
320	}
321
322	/**
323	* Cleans up HTML, removes dangerous tags and attributes, and
324	* removes HTML comments; the result will always be balanced and
325	* tidy HTML.
326	* @param string $text Source string; see T268353 for why untainted
327	* @param-taint $text none
328	* @param array $options Options controlling the cleanup:
329	* string[] $options['extraTags'] Any extra tags to allow
330	* (This property taints the whole array.)
331	* string[] $options['removeTags'] Any tags (default or extra) to exclude
332	* callable(Attributes,...):Attributes $options['attrCallback'] Callback
333	* to do any variable or parameter replacements in HTML attribute
334	* values before further cleanup; should be considered @internal
335	* and not for external use.
336	* array $options['attrCallbackArgs'] Additional arguments for the
337	* attribute callback
338	* @param-taint $options tainted
339	* @return string The cleaned up HTML
340	* @return-taint escaped
341	* @since 1.38
342	*/
343	public static function removeSomeTags(
344	string $text, array $options = []
345	): string {
346	$extraTags = $options['extraTags'] ?? []; // every option is optional; a missing key falls back to an empty default
347	$removeTags = $options['removeTags'] ?? [];
348	// These options are @internal:
349	$attrCallback = $options['attrCallback'] ?? null;
350	$attrCallbackArgs = $options['attrCallbackArgs'] ?? [];
351
352	// This disallows HTML5-style "missing trailing semicolon" attributes
353	// In wikitext "clean&copy" does not contain an entity.
354	$text = self::normalizeCharReferences( $text );
355
356	$tagData = self::getRecognizedTagData( $extraTags, $removeTags ); // lookup tables of recognised tags for this call
357	// Use RemexHtml to tokenize $text and remove the barred tags
358	$formatter = new RemexCompatFormatter;
359	$serializer = new RemexSerializer( $formatter );
360	$treeBuilder = new RemexTreeBuilder( $serializer, [
361	'ignoreErrors' => true,
362	'ignoreNulls' => true,
363	] );
364	$dispatcher = new RemexDispatcher( $treeBuilder );
365	$tokenHandler = $dispatcher;
366	$remover = new RemexRemoveTagHandler( // sits between the tokenizer and the dispatcher so it sees every token first
367	$tokenHandler, $text, $tagData,
368	$attrCallback, $attrCallbackArgs
369	);
370	$tokenizer = new RemexTokenizer( $remover, $text, [
371	'ignoreErrors' => true,
372	// don't ignore char refs, we want them to be decoded
373	'ignoreNulls' => true,
374	'skipPreprocess' => true,
375	] );
376	$tokenizer->execute( [ // tokenize as a fragment whose context element is an HTML <body>
377	'fragmentNamespace' => HTMLData::NS_HTML,
378	'fragmentName' => 'body',
379	] );
380	return $serializer->getResult(); // the output accumulated by the serializer at the end of the pipeline
381	}
382
383	/**
384	* Remove '<!--', '-->', and everything between.
385	* To avoid leaving blank lines, when a comment is both preceded
386	* and followed by a newline (ignoring spaces), trim leading and
387	* trailing spaces and one of the newlines.
388	*/
389	public static function removeHTMLcomments( string $text ): string {
390	// phpcs:ignore Generic.CodeAnalysis.AssignmentInCondition.FoundInWhileCondition
391	while ( ( $start = strpos( $text, '<!--' ) ) !== false ) { // repeat until no comment opener remains
392	$end = strpos( $text, '-->', $start + 4 ); // look for the terminator after the 4-byte opener
393	if ( $end === false ) {
394	# Unterminated comment; bail out
395	break;
396	}
397
398	$end += 3; // $end now points just past '-->'
399
400	# Trim space and newline if the comment is both
401	# preceded and followed by a newline
402	$spaceStart = max( $start - 1, 0 ); // start at the byte before the comment (clamped to the string start)
403	$spaceLen = $end - $spaceStart;
404	while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) { // extend the span backwards over preceding spaces
405	$spaceStart--;
406	$spaceLen++;
407	}
408	while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) { // extend the span forwards over following spaces
409	$spaceLen++;
410	}
411	if ( substr( $text, $spaceStart, 1 ) === "\n"
412	&& substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
413	# Remove the comment, leading and trailing
414	# spaces, and leave only one newline.
415	$text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 ); // the +1 consumes the trailing newline as well
416	} else {
417	# Remove just the comment.
418	$text = substr_replace( $text, '', $start, $end - $start );
419	}
420	}
421	return $text;
422	}
423
424	/**
425	* Takes attribute names and values for a tag and the tag name and
426	* validates that the tag is allowed to be present.
427	* This DOES NOT validate the attributes, nor does it validate the
428	* tags themselves. This method only handles the special circumstances
429	* where we may want to allow a tag within content but ONLY when it has
430	* specific attributes set.
431	*
432	* @see RemexRemoveTagHandler::validateTag()
433	*/
434	private static function validateTag( string $params, string $element ): bool {
435	$params = self::decodeTagAttributes( $params ); // raw attribute text becomes an array that is indexed by attribute name below
436
437	if ( $element == 'meta' \|\| $element == 'link' ) { // only these two elements have extra requirements; all others pass
438	if ( !isset( $params['itemprop'] ) ) {
439	// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
440	return false;
441	}
442	if ( $element == 'meta' && !isset( $params['content'] ) ) {
443	// <meta> must have a content="" for the itemprop
444	return false;
445	}
446	if ( $element == 'link' && !isset( $params['href'] ) ) {
447	// <link> must have an associated href=""
448	return false;
449	}
450	}
451
452	return true;
453	}
454
455	/**
456	* Take an array of attribute names and values and normalize or discard
457	* illegal values for the given element type.
458	*
459	* - Discards attributes not allowed for the given element
460	* - Unsafe style attributes are discarded
461	* - Invalid id attributes are re-encoded
462	*
463	* @todo Check for legal values where the DTD limits things.
464	* @todo Check for unique id attribute :P
465	*/
466	public static function validateTagAttributes( array $attribs, string $element ): array {
467	return self::validateAttributes( $attribs, // delegate, using the allowed-attribute list looked up for this element
468	self::attributesAllowedInternal( $element ) );
469	}
470
471	/**
472	* Take an array of attribute names and values and normalize or discard
473	* illegal values.
474	*
475	* - Discards attributes not on the given list
476	* - Unsafe style attributes are discarded
477	* - Invalid id attributes are re-encoded
478	*
479	* @param array $attribs
480	* @param array $allowed List of allowed attribute names,
481	* as an associative array where keys give valid attribute names
482	* (since 1.34). Before 1.35, passing a sequential array of
483	* valid attribute names was permitted but that is now deprecated.
484	* @return array
485	*
486	* @todo Check for legal values where the DTD limits things.
487	* @todo Check for unique id attribute :P
488	*/
489	public static function validateAttributes( array $attribs, array $allowed ): array {
490	if ( isset( $allowed[0] ) ) { // a key 0 means the caller passed a sequential list rather than a name-keyed map
491	// Calling this function with a sequential array is
492	// deprecated. For now just convert it.
493	wfDeprecated( __METHOD__ . ' with sequential array', '1.35' );
494	$allowed = array_fill_keys( $allowed, true );
495	}
496	$validProtocols = MediaWikiServices::getInstance()->getUrlUtils()->validProtocols();
497	$hrefExp = '/^(' . $validProtocols . ')[^\s]+$/'; // value must start with an allowed protocol and contain no whitespace
498
499	$out = [];
500	foreach ( $attribs as $attribute => $value ) {
501	# Allow XML namespace declaration to allow RDFa
502	if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
503	if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
504	$out[$attribute] = $value;
505	}
506
507	continue; // xmlns:* attributes skip all of the remaining checks, whether kept or dropped
508	}
509
510	# Allow any attribute beginning with "data-"
511	# However:
512	# * Disallow data attributes used by MediaWiki code
513	# * Ensure that the attribute is not namespaced by banning
514	# colons.
515	if ( (
516	!preg_match( '/^data-[^:]*$/i', $attribute ) &&
517	!array_key_exists( $attribute, $allowed )
518	) \|\| self::isReservedDataAttribute( $attribute ) ) { // reserved data-* names are dropped even if listed in $allowed
519	continue;
520	}
521
522	# Strip javascript "expression" from stylesheets.
523	# https://msdn.microsoft.com/en-us/library/ms537634.aspx
524	if ( $attribute == 'style' ) {
525	$value = self::checkCss( $value );
526	}
527
528	# Escape HTML id attributes
529	if ( $attribute === 'id' ) {
530	$value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
531	if ( $value === false \|\| $value === '' ) { // drop the id entirely when escaping yields nothing usable
532	continue;
533	}
534	}
535
536	# Escape HTML id reference lists
537	if ( $attribute === 'aria-describedby'
538	\|\| $attribute === 'aria-flowto'
539	\|\| $attribute === 'aria-labelledby'
540	\|\| $attribute === 'aria-owns'
541	) {
542	$value = self::escapeIdReferenceListInternal( $value );
543	}
544
545	// RDFa and microdata properties allow URLs, URIs and/or CURIs.
546	if ( $attribute === 'rel' \|\| $attribute === 'rev'
547	# RDFa
548	\|\| $attribute === 'about' \|\| $attribute === 'property'
549	\|\| $attribute === 'resource' \|\| $attribute === 'datatype'
550	\|\| $attribute === 'typeof'
551	# HTML5 microdata
552	\|\| $attribute === 'itemid' \|\| $attribute === 'itemprop'
553	\|\| $attribute === 'itemref' \|\| $attribute === 'itemscope'
554	\|\| $attribute === 'itemtype'
555	) {
556	// Paranoia. Allow "simple" values but suppress javascript
557	if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
558	continue;
559	}
560	}
561
562	# NOTE: even though elements using href/src are not allowed directly, supply
563	# validation code that can be used by tag hook handlers, etc
564	if ( $attribute === 'href' \|\| $attribute === 'src' \|\| $attribute === 'poster' ) {
565	if ( !preg_match( $hrefExp, $value ) ) {
566	continue; // drop any href or src attributes not using an allowed protocol.
567	// NOTE: this also drops all relative URLs
568	}
569	}
570
571	if ( $attribute === 'tabindex' && $value !== '0' ) { // strict comparison against the string '0'
572	// Only allow tabindex of 0, which is useful for accessibility.
573	continue;
574	}
575
576	// If this attribute was previously set, override it.
577	// Output should only have one attribute of each name.
578	$out[$attribute] = $value;
579	}
580
581	# itemtype, itemid, itemref don't make sense without itemscope
582	if ( !array_key_exists( 'itemscope', $out ) ) { // array_key_exists so that an itemscope with a null value still counts
583	unset( $out['itemtype'] );
584	unset( $out['itemid'] );
585	unset( $out['itemref'] );
586	}
587	# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
588
589	return $out;
590	}
591
592	/**
593	* Given an attribute name, checks whether it is a reserved data attribute
594	* (such as data-mw-foo) which is unavailable to user-generated HTML so MediaWiki
595	* core and extension code can safely use it to communicate with frontend code.
596	* @param string $attr Attribute name.
597	* @return bool
598	*/
599	public static function isReservedDataAttribute( string $attr ): bool {
600	// data-ooui is reserved for ooui.
601	// data-mw and data-parsoid are reserved for parsoid.
602	// data-mw-<name here> is reserved for extensions (or core) if
603	// they need to communicate some data to the client and want to be
604	// sure that it isn't coming from an untrusted user.
605	// We ignore the possibility of namespaces since user-generated HTML
606	// can't use them anymore.
607	return (bool)preg_match( '/^data-(ooui\|mw\|parsoid)/i', $attr ); // case-insensitive prefix match with no end anchor, so any name starting with these is reserved
608	}
609
610	/**
611	* Merge two sets of HTML attributes. Conflicting items in the second set
612	* will override those in the first, except for 'class' attributes which
613	* will be combined (if they're both strings).
614	*
615	* @todo implement merging for other attributes such as style
616	*/
617	public static function mergeAttributes( array $a, array $b ): array {
618	$out = array_merge( $a, $b ); // for string keys, values from $b replace those from $a
619	if ( isset( $a['class'] ) && isset( $b['class'] )
620	&& is_string( $a['class'] ) && is_string( $b['class'] )
621	&& $a['class'] !== $b['class'] // identical class strings need no merging
622	) {
623	$classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}", // split the concatenated class lists on whitespace
624	-1, PREG_SPLIT_NO_EMPTY );
625	$out['class'] = implode( ' ', array_unique( $classes ) ); // drop duplicate class names, then re-join with single spaces
626	}
627	return $out;
628	}
629
630	/**
631	* Normalize CSS into a format we can easily search for hostile input
632	* - decode character references
633	* - decode escape sequences
634	* - remove comments, unless the entire value is one single comment
635	* @param string $value the css string
636	* @return string normalized css
637	*/
638	public static function normalizeCss( string $value ): string {
639	// Decode character references like &#123;
640	$value = self::decodeCharReferences( $value );
641
642	// Decode escape sequences and line continuation
643	// See the grammar in the CSS 2 spec, appendix D.
644	// This has to be done AFTER decoding character references.
645	// This means it isn't possible for this function to return
646	// unsanitized escape sequences. It is possible to manufacture
647	// input that contains character references that decode to
648	// escape sequences that decode to character references, but
649	// it's OK for the return value to contain character references
650	// because the caller is supposed to escape those anyway.
651	static $decodeRegex; // built on the first call, then reused
652	if ( !$decodeRegex ) {
653	$space = '[\\x20\\t\\r\\n\\f]';
654	$nl = '(?:\\n\|\\r\\n\|\\r\|\\f)';
655	$backslash = '\\\\';
656	$decodeRegex = "/ $backslash
657	(?:
658	($nl) \| # 1. Line continuation
659	([0-9A-Fa-f]{1,6})$space? \| # 2. character number
660	(.) \| # 3. backslash cancelling special meaning
661	() \| # 4. backslash at end of string
662	)/xu";
663	}
664	$value = preg_replace_callback( $decodeRegex,
665	[ __CLASS__, 'cssDecodeCallback' ], $value );
666
667	// Let the value through if it's nothing but a single comment, to
668	// allow other functions which may reject it to pass some error
669	// message through.
670	if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
671	// Remove any comments; IE gets token splitting wrong
672	// This must be done AFTER decoding character references and
673	// escape sequences, because those steps can introduce comments
674	// This step cannot introduce character references or escape
675	// sequences, because it replaces comments with spaces rather
676	// than removing them completely.
677	$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
678
679	// Remove anything after a comment-start token, to guard against
680	// incorrect client implementations.
681	$commentPos = strpos( $value, '/*' );
682	if ( $commentPos !== false ) {
683	$value = substr( $value, 0, $commentPos );
684	}
685	}
686
687	return $value;
688	}
689
690	/**
691	* Pick apart some CSS and check it for forbidden or unsafe structures.
692	* Returns a sanitized string. This sanitized string will have
693	* character references and escape sequences decoded and comments
694	* stripped (unless it is itself one valid comment, in which case the value
695	* will be passed through). If the input is just too evil, only a comment
696	* complaining about evilness will be returned.
697	*
698	* Currently URL references, 'expression', 'tps' are forbidden.
699	*
700	* NOTE: Despite the fact that character references are decoded, the
701	* returned string may contain character references given certain
702	* clever input strings. These character references must
703	* be escaped before the return value is embedded in HTML.
704	*
705	* @warning This method is intended to sanitize style attributes on
706	* html tags only. It is not safe to use on full CSS files.
707	* @param string $value
708	* @return string
709	*/
710	public static function checkCss( $value ) {
711	$value = self::normalizeCss( $value ); // all checks below run on the normalized form
712
713	// Reject problematic keywords and control characters
714	if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) \|\| // control characters other than tab, LF, FF and CR
715	strpos( $value, \UtfNormal\Constants::UTF8_REPLACEMENT ) !== false ) {
716	return '/* invalid control char */';
717	} elseif ( preg_match( // case-insensitive, extended-mode alternation of forbidden constructs
718	'! expression
719	\| accelerator\s*:
720	\| -o-link\s*:
721	\| -o-link-source\s*:
722	\| -o-replace\s*:
723	\| url\s*\(
724	\| src\s*\(
725	\| image\s*\(
726	\| image-set\s*\(
727	\| attr\s*\([^)]+[\s,]+url
728	!ix', $value ) ) {
729	return '/* insecure input */';
730	}
731	return $value; // nothing objectionable found: return the normalized CSS
732	}
733
/**
 * preg_replace_callback() handler that decodes one CSS escape sequence.
 *
 * Group 1 non-empty means a line continuation, which is dropped.
 * Group 2 holds the hex digits of a numeric escape, group 3 a single
 * escaped character; when all are empty the match is treated as a
 * lone backslash.
 *
 * @param array $matches
 * @return string
 */
private static function cssDecodeCallback( array $matches ): string {
	if ( $matches[1] !== '' ) {
		// Line continuation
		return '';
	}

	if ( $matches[2] !== '' ) {
		# hexdec could return a float if the match is too long, but the
		# regexp in question limits the string length to 6.
		$decoded = \UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$decoded = $matches[3];
	} else {
		$decoded = '\\';
	}

	// Newline, both quote characters and the backslash need to be
	// escaped in strings, so they are re-emitted as a clean hex escape
	// to avoid parsing errors by clients.
	if ( in_array( $decoded, [ "\n", '"', "'", '\\' ], true ) ) {
		return '\\' . dechex( ord( $decoded ) ) . ' ';
	}

	// Decode unnecessary escape
	return $decoded;
}
756
/**
 * Take a tag soup fragment listing an HTML element's attributes
 * and normalize it to well-formed XML, discarding unwanted attributes.
 * Output is safe for further wikitext processing, with escaping of
 * values that could trigger problems.
 *
 * The work is delegated in three steps: decodeTagAttributes() parses
 * the fragment, validateTagAttributes() filters it for the given
 * element, and safeEncodeTagAttributes() serializes what is left
 * (double-quoted values, leading space when anything remains).
 * A fragment that is empty after trimming yields an empty string.
 *
 * @param string $text
 * @param string $element
 * @param bool $sorted Whether to sort the attributes (default: false)
 * @return string
 */
public static function fixTagAttributes( string $text, string $element, bool $sorted = false ): string {
	if ( trim( $text ) === '' ) {
		return '';
	}

	$accepted = self::validateTagAttributes( self::decodeTagAttributes( $text ), $element );
	if ( $sorted ) {
		ksort( $accepted );
	}

	return self::safeEncodeTagAttributes( $accepted );
}
792
/**
 * Encode an attribute value for HTML output.
 *
 * The text is escaped with htmlspecialchars( ENT_QUOTES ), then
 * newline, carriage return and tab are turned into numeric character
 * references so that they survive attribute whitespace normalization.
 *
 * @param string $text
 * @param-taint $text escapes_html
 * @return string HTML-encoded text fragment
 * @return-taint escaped
 */
public static function encodeAttribute( string $text ): string {
	$encValue = htmlspecialchars( $text, ENT_QUOTES );

	// Whitespace is normalized during attribute decoding,
	// so if we've been passed non-spaces we must encode them
	// ahead of time or they won't be preserved.
	return strtr( $encValue, [
		"\n" => '&#10;',
		"\r" => '&#13;',
		"\t" => '&#9;',
	] );
}
814
/**
 * Armor French spaces with a replacement character
 *
 * A plain space is replaced when it directly precedes one of
 * ? : ; ! % » › (and that character is not followed by a word
 * character), or when it directly follows « or ‹.
 *
 * @since 1.32
 * @param string $text Text to armor
 * @param string $space Space character for the French spaces, defaults to '&#160;'
 * @return string Armored text
 */
public static function armorFrenchSpaces( string $text, string $space = '&#160;' ): string {
	// Replace $ with \$ and \ with \\ so that $space is taken literally
	// when used as a preg_replace() replacement string below.
	$space = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );
	$fixtags = [
		# French spaces, last one Guillemet-left
		# only if it isn't followed by a word character.
		'/ (?=[?:;!%»›](?!\w))/u' => "$space",
		# French spaces, Guillemet-right
		'/([«‹]) /u' => "\\1$space",
	];
	return preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
}
835
/**
 * Encode an attribute value for HTML tags, with extra armoring
 * against further wiki processing.
 *
 * On top of encodeAttribute(), characters and sequences that later
 * wikitext parsing could act on (braces, brackets, doubled apostrophes,
 * magic-link keywords, pipes, double underscores, URL protocols) are
 * rewritten using character references.
 *
 * @param string $text
 * @param-taint $text escapes_html
 * @return string HTML-encoded text fragment
 * @return-taint escaped
 */
public static function safeEncodeAttribute( string $text ): string {
	$encValue = self::encodeAttribute( $text );

	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$encValue = strtr( $encValue, [
		// '<', '>', and '"' should never happen, as they indicate that we've received invalid input which should
		// have been escaped.
		'<' => '&lt;',
		'>' => '&gt;',
		'"' => '&quot;',
		'{' => '&#123;',
		'}' => '&#125;', // prevent unpaired language conversion syntax
		'[' => '&#91;',
		']' => '&#93;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	] );

	# Stupid hack: break up anything matching a valid URL protocol by
	# encoding its colon(s).
	$validProtocols = MediaWikiServices::getInstance()->getUrlUtils()->validProtocols();
	$encValue = preg_replace_callback(
		'/((?i)' . $validProtocols . ')/',
		static function ( $matches ) {
			return str_replace( ':', '&#58;', $matches[1] );
		},
		$encValue );
	return $encValue;
}
877
/**
 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 * a valid HTML id attribute.
 *
 * The escaping flavour is looked up in the global $wgFragmentMode using $mode as index.
 *
 * WARNING: The output of this function is not guaranteed to be HTML safe, so be sure to use
 * proper escaping.
 *
 * @param string $id String to escape
 * @param int $mode One of ID_* constants, specifying whether the primary or fallback encoding
 *     should be used.
 * @return string|false Escaped ID or false if fallback encoding is requested but it's not
 *     configured.
 * @throws UnexpectedValueException If no primary mode is configured and the primary mode was requested
 *
 * @since 1.30
 */
public static function escapeIdForAttribute( string $id, int $mode = self::ID_PRIMARY ) {
	global $wgFragmentMode;

	if ( isset( $wgFragmentMode[$mode] ) ) {
		return self::escapeIdInternal( $id, $wgFragmentMode[$mode] );
	}

	if ( $mode === self::ID_PRIMARY ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	// A non-primary mode that is simply not configured
	return false;
}
907
/**
 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 * a valid URL fragment.
 *
 * Uses the primary entry of the global $wgFragmentMode.
 *
 * WARNING: The output of this function is not guaranteed to be HTML safe, so be sure to use
 * proper escaping.
 *
 * @param string $id String to escape
 * @return string Escaped ID
 * @throws UnexpectedValueException If no primary mode is configured
 *
 * @since 1.30
 */
public static function escapeIdForLink( string $id ): string {
	global $wgFragmentMode;

	$primaryMode = $wgFragmentMode[self::ID_PRIMARY] ?? null;
	if ( $primaryMode === null ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return self::escapeIdInternalUrl( $id, $primaryMode );
}
933
/**
 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 * a valid URL fragment for external interwikis.
 *
 * The escaping mode is taken from the global $wgExternalInterwikiFragmentMode.
 *
 * @param string $id String to escape
 * @return string Escaped ID
 *
 * @since 1.30
 */
public static function escapeIdForExternalInterwiki( string $id ): string {
	global $wgExternalInterwikiFragmentMode;

	return self::escapeIdInternalUrl( $id, $wgExternalInterwikiFragmentMode );
}
950
/**
 * Do percent encoding of percent signs for href (but not id) attributes
 *
 * The id is escaped with escapeIdInternal() first. In 'html5' mode,
 * every '%' that is followed by two hex digits is then rewritten to
 * '%25' so that the sequence is not read as a percent escape.
 *
 * @since 1.35
 * @see https://phabricator.wikimedia.org/T238385
 * @param string $id String to escape
 * @param string $mode One of modes from $wgFragmentMode
 * @return string
 */
private static function escapeIdInternalUrl( string $id, string $mode ): string {
	$escaped = self::escapeIdInternal( $id, $mode );
	if ( $mode !== 'html5' ) {
		return $escaped;
	}

	return preg_replace( '/%([a-fA-F0-9]{2})/', '%25$1', $escaped );
}
967
/**
 * Helper for escapeIdFor*() functions. Performs most of the actual escaping.
 *
 * @param string $id String to escape
 * @param string $mode One of modes from $wgFragmentMode
 * @return string
 * @throws InvalidArgumentException If $mode is neither 'html5' nor 'legacy'
 */
private static function escapeIdInternal( string $id, string $mode ): string {
	// Truncate overly-long IDs. This isn't an HTML limit, it's just
	// griefer protection. [T251506]
	$id = mb_substr( $id, 0, 1024 );

	switch ( $mode ) {
		case 'html5':
			// html5 spec says ids must not have any of the following:
			// U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE
			// In practice, in wikitext, only tab, LF, CR (and SPACE) are
			// possible using either Lua or html entities.
			return str_replace( [ "\t", "\n", "\f", "\r", " " ], '_', $id );

		case 'legacy':
			// This corresponds to 'noninitial' mode of the former escapeId():
			// URL-encode, then keep ':' readable and use '.' as the escape
			// character in place of '%'.
			return strtr(
				urlencode( str_replace( ' ', '_', $id ) ),
				[
					'%3A' => ':',
					'%' => '.',
				]
			);

		default:
			// The method name is quoted on both sides (the closing quote
			// used to be missing from the message).
			throw new InvalidArgumentException( "Invalid mode '$mode' passed to '" . __METHOD__ . "'" );
	}
}
1004
/**
 * Given a string containing a space delimited list of ids, escape each id
 * to match ids escaped by the escapeIdForAttribute() function.
 *
 * The list is split on runs of whitespace (empty tokens are dropped),
 * each token is escaped with the default mode of escapeIdForAttribute(),
 * and the tokens are joined back with single spaces. An input without
 * tokens yields an empty string.
 *
 * @param string $referenceString Space delimited list of ids
 * @return string
 */
private static function escapeIdReferenceListInternal( string $referenceString ): string {
	$tokens = preg_split( '/\s+/', $referenceString, -1, PREG_SPLIT_NO_EMPTY );

	$escapedTokens = array_map(
		static fn ( $token ) => self::escapeIdForAttribute( $token ),
		$tokens
	);

	return implode( ' ', $escapedTokens );
}
1027
/**
 * Given a value, escape it so that it can be used as a CSS class and
 * return it.
 *
 * A leading digit or hyphen, bytes 0x00-0x20, ASCII punctuation other
 * than '-' and '_', and the byte pair C2 A0 are replaced by
 * underscores; runs of underscores are then collapsed to one and
 * trailing underscores are removed.
 *
 * @todo For extra validity, input should be validated UTF-8.
 *
 * @see https://www.w3.org/TR/CSS21/syndata.html Valid characters/format
 */
public static function escapeClass( string $class ): string {
	// Convert ugly stuff to underscores ...
	$underscored = preg_replace(
		'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
		'_',
		$class
	);
	// ... and kill underscores in ugly places
	$collapsed = preg_replace( '/_+/', '_', $underscored );

	return rtrim( $collapsed, '_' );
}
1043
/**
 * Given HTML input, escape with htmlspecialchars but un-escape entities.
 * This allows (generally harmless) entities like &#8201; to survive.
 *
 * Character references are decoded first, then the result is escaped.
 *
 * @param string $html HTML to escape
 * @param-taint $html escapes_htmlnoent
 * @return string Escaped input
 * @return-taint escaped
 */
public static function escapeHtmlAllowEntities( string $html ): string {
	$decoded = self::decodeCharReferences( $html );

	# It seems wise to escape ' as well as ", as a matter of course. Can't
	# hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
	# don't cause the entire string to disappear.
	return htmlspecialchars( $decoded, ENT_QUOTES | ENT_SUBSTITUTE );
}
1061
/**
 * Return an associative array of attribute names and values from
 * a partial tag string. Attribute names are forced to lowercase,
 * character references are decoded to UTF-8 text.
 *
 * Names rejected by the attribute-name pattern are skipped. Whitespace
 * inside each value is collapsed to single spaces and trimmed. When a
 * name occurs more than once, the later value overwrites the earlier.
 */
public static function decodeTagAttributes( string $text ): array {
	if ( trim( $text ) === '' ) {
		return [];
	}

	$found = [];
	$hits = preg_match_all( self::getAttribsRegex(), $text, $found, PREG_SET_ORDER );
	if ( !$hits ) {
		return [];
	}

	$result = [];
	foreach ( $found as $match ) {
		$name = strtolower( $match[1] );

		// Filter attribute names with unacceptable characters
		if ( !preg_match( self::getAttribNameRegex(), $name ) ) {
			continue;
		}

		// Normalize whitespace, then decode character references
		$raw = self::getTagAttributeCallback( $match );
		$raw = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

		$result[$name] = self::decodeCharReferences( $raw );
	}

	return $result;
}
1101
/**
 * Build a partial tag string from an associative array of attribute
 * names and values as returned by decodeTagAttributes.
 *
 * Each pair is written as name="value", with the name escaped by
 * htmlspecialchars() and the value by safeEncodeAttribute(). The pairs
 * are separated by single spaces and the result starts with a space;
 * an empty array yields an empty string.
 */
public static function safeEncodeTagAttributes( array $assoc_array ): string {
	$pairs = [];
	foreach ( $assoc_array as $name => $value ) {
		$pairs[] = htmlspecialchars( $name, ENT_COMPAT )
			. '="' . self::safeEncodeAttribute( $value ) . '"';
	}

	if ( $pairs === [] ) {
		return '';
	}

	return ' ' . implode( ' ', $pairs );
}
1116
/**
 * Pick the appropriate attribute value from a match set from the
 * attribs regex matches.
 *
 * Groups are checked in the order 5 (unquoted), 4 (single-quoted),
 * 3 (double-quoted); the first one that is set is returned. If none
 * is set and group 2 is not set either, the attribute has no value
 * and an empty string is returned.
 *
 * @throws LogicException If group 2 is set but no value group is
 */
private static function getTagAttributeCallback( array $set ): string {
	# 5: no quotes, 4: single-quoted, 3: double-quoted
	foreach ( [ 5, 4, 3 ] as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( isset( $set[2] ) ) {
		throw new LogicException( "Tag conditions not met. This should never happen and is a bug." );
	}

	# In XHTML, attributes must have a value so return an empty string.
	# See "Empty attribute syntax",
	# https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
	return "";
}
1140
/**
 * Collapse every run of spaces, tabs, CR and LF to a single space and
 * trim the result.
 */
private static function normalizeWhitespace( string $text ): string {
	$collapsed = preg_replace( '/(?:\r\n|[\x20\x0d\x0a\x09])+/', ' ', $text );

	return trim( $collapsed );
}
1147
/**
 * Normalizes whitespace in a section name, such as might be returned
 * by Parser::stripSectionName(), for use in the id's that are used for
 * section links.
 *
 * Runs of spaces and underscores become one space; the result is trimmed.
 */
public static function normalizeSectionNameWhitespace( string $section ): string {
	$singleSpaced = preg_replace( '/[ _]+/', ' ', $section );

	return trim( $singleSpaced );
}
1156
/**
 * Ensure that any entities and character references are legal
 * for XML and XHTML specifically. Any stray bits will be
 * &-escaped to result in a valid text fragment.
 *
 * a. named char refs can only be &lt; &gt; &amp; &quot;, others are
 *   numericized (this way we're well-formed even without a DTD)
 * b. any numeric char refs must be legal chars, not invalid or forbidden
 * c. use lower cased "&#x", not "&#X"
 * d. fix or reject non-valid attributes
 *
 * Every match of CHAR_REFS_REGEX is handed to
 * normalizeCharReferencesCallback(); unmatched groups arrive as null.
 *
 * @internal
 */
public static function normalizeCharReferences( string $text ): string {
	$normalizeOne = static fn ( array $match ): string =>
		self::normalizeCharReferencesCallback( $match );

	// $count is only passed to reach the flags parameter
	return preg_replace_callback(
		self::CHAR_REFS_REGEX,
		$normalizeOne,
		$text,
		-1,
		$count,
		PREG_UNMATCHED_AS_NULL
	);
}
1177
/**
 * Normalize a single CHAR_REFS_REGEX match.
 *
 * Group 1 is treated as a named entity, group 2 as a decimal and
 * group 3 as a hexadecimal reference. When no group is set, or the
 * numeric reference is rejected, the whole match is HTML-escaped.
 *
 * @param array $matches
 * @return string
 */
private static function normalizeCharReferencesCallback( array $matches ): string {
	$normalized = null;
	if ( isset( $matches[1] ) ) {
		$normalized = self::normalizeEntity( $matches[1] );
	} elseif ( isset( $matches[2] ) ) {
		$normalized = self::decCharReference( $matches[2] );
	} elseif ( isset( $matches[3] ) ) {
		$normalized = self::hexCharReference( $matches[3] );
	}

	return $normalized ?? htmlspecialchars( $matches[0], ENT_COMPAT );
}
1193
/**
 * If the named entity is defined in HTML5
 * return the equivalent numeric entity reference (except for the core &lt;
 * &gt; &amp; &quot;). If the entity is a MediaWiki-specific alias, returns
 * the HTML equivalent. Otherwise, returns HTML-escaped text of
 * pseudo-entity source (eg &amp;foo;)
 *
 * @param string $name Semicolon-terminated name
 * @return string
 */
private static function normalizeEntity( string $name ): string {
	if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
		// Non-standard MediaWiki-specific entities
		return '&' . self::MW_ENTITY_ALIASES[$name];
	}

	if ( in_array( $name, [ 'lt;', 'gt;', 'amp;', 'quot;' ], true ) ) {
		// Keep these in word form
		return "&$name";
	}

	if ( isset( HTMLData::$namedEntityTranslations[$name] ) ) {
		// Beware: some entities expand to more than 1 codepoint
		return preg_replace_callback( '/./Ssu', static function ( $m ) {
			return '&#' . \UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';';
		}, HTMLData::$namedEntityTranslations[$name] );
	}

	// Unknown name: escape the ampersand so the text is shown literally
	// instead of being emitted as an undefined entity reference.
	return "&amp;$name";
}
1220
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint Digits of the reference
 * @return string|null "&#N;" if validateCodepoint() accepts N, else null
 */
private static function decCharReference( string $codepoint ): ?string {
	# intval() will (safely) saturate at the maximum signed integer
	# value if $codepoint is too many digits
	$number = intval( $codepoint );

	return self::validateCodepoint( $number ) ? "&#$number;" : null;
}
1231
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint Hex digits of the reference
 * @return string|null "&#x…;" in lower-case hex if validateCodepoint()
 *   accepts the value, else null
 */
private static function hexCharReference( string $codepoint ): ?string {
	$number = hexdec( $codepoint );

	// hexdec() might return a float if the string is too long
	if ( !is_int( $number ) || !self::validateCodepoint( $number ) ) {
		return null;
	}

	return sprintf( '&#x%x;', $number );
}
1241
/**
 * Returns true if a given Unicode codepoint is a valid character in
 * both HTML5 and XML.
 *
 * Accepted: U+0009, U+000A, U+0020-U+007E, U+00A0-U+D7FF,
 * U+E000-U+FFFD and U+10000-U+10FFFF.
 */
private static function validateCodepoint( int $codepoint ): bool {
	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	# U+007F - U+009F are disallowed in HTML5 (control characters).
	if ( $codepoint === 0x09 || $codepoint === 0x0a ) {
		return true;
	}

	$acceptedRanges = [
		[ 0x20, 0x7e ],
		[ 0xa0, 0xd7ff ],
		[ 0xe000, 0xfffd ],
		[ 0x10000, 0x10ffff ],
	];
	foreach ( $acceptedRanges as [ $first, $last ] ) {
		if ( $codepoint >= $first && $codepoint <= $last ) {
			return true;
		}
	}

	return false;
}
1257
/**
 * Decode any character references, numeric or named entities,
 * in the text and return a UTF-8 string.
 *
 * Every match of CHAR_REFS_REGEX is handed to
 * decodeCharReferencesCallback(); unmatched groups arrive as null.
 */
public static function decodeCharReferences( string $text ): string {
	$decodeOne = static fn ( array $match ): string =>
		self::decodeCharReferencesCallback( $match );

	// $count is only passed to reach the flags parameter
	return preg_replace_callback(
		self::CHAR_REFS_REGEX,
		$decodeOne,
		$text,
		-1,
		$count,
		PREG_UNMATCHED_AS_NULL
	);
}
1269
/**
 * Decode any character references, numeric or named entities,
 * in the text and normalize the resulting string. (T16952)
 *
 * This is useful for page titles, not for text to be displayed,
 * MediaWiki allows HTML entities to escape normalization as a feature.
 *
 * The content language's normalize() is only invoked when at least one
 * replacement was made.
 *
 * @param string $text Already normalized, containing entities
 * @return string Still normalized, without entities
 */
public static function decodeCharReferencesAndNormalize( string $text ): string {
	$decoded = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		[ self::class, 'decodeCharReferencesCallback' ],
		$text, -1, $replacements, PREG_UNMATCHED_AS_NULL
	);

	if ( !$replacements ) {
		// Nothing was replaced, so the text is unchanged
		return $decoded;
	}

	return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $decoded );
}
1293
/**
 * Decode a single CHAR_REFS_REGEX match to UTF-8.
 *
 * Group 1 is treated as a named entity, group 2 as a decimal and
 * group 3 as a hexadecimal reference. With no group set, the match is
 * returned unchanged.
 *
 * @param array $matches
 * @return string
 */
private static function decodeCharReferencesCallback( array $matches ): string {
	if ( isset( $matches[1] ) ) {
		return self::decodeEntity( $matches[1] );
	}

	if ( isset( $matches[2] ) ) {
		return self::decodeChar( intval( $matches[2] ) );
	}

	if ( isset( $matches[3] ) ) {
		$number = hexdec( $matches[3] );
		// hexdec() might return a float if the string is too long;
		// that makes it an invalid character reference.
		return is_int( $number )
			? self::decodeChar( $number )
			: \UtfNormal\Constants::UTF8_REPLACEMENT;
	}

	# Last case should be an ampersand by itself
	return $matches[0];
}
1311
/**
 * Return UTF-8 string for a codepoint if that is a valid
 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 * @internal
 */
private static function decodeChar( int $codepoint ): string {
	return self::validateCodepoint( $codepoint )
		? \UtfNormal\Utils::codepointToUtf8( $codepoint )
		: \UtfNormal\Constants::UTF8_REPLACEMENT;
}
1324
/**
 * If the named entity is defined in HTML5
 * return the UTF-8 encoding of that character. Otherwise, returns
 * pseudo-entity source (eg "&foo;")
 *
 * MediaWiki-specific aliases are mapped to their target name first;
 * the pseudo-entity fallback uses that mapped name.
 *
 * @param string $name Semicolon-terminated entity name
 * @return string
 */
private static function decodeEntity( string $name ): string {
	// These are MediaWiki-specific entities, not in the HTML standard
	$canonical = self::MW_ENTITY_ALIASES[$name] ?? $name;

	return HTMLData::$namedEntityTranslations[$canonical] ?? "&$canonical";
}
1341
/**
 * Fetch the list of acceptable attributes for a given element name.
 *
 * @param string $element
 * @return array An associative array where keys are acceptable attribute
 *   names; empty when the element is not in the table
 */
private static function attributesAllowedInternal( string $element ): array {
	return self::setupAttributesAllowedInternal()[$element] ?? [];
}
1353
/**
 * Foreach array key (an allowed HTML element), return an array
 * of allowed attributes.
 *
 * The table is built on the first call and kept in a static variable
 * for later calls.
 *
 * @return array An associative array: keys are HTML element names;
 * values are associative arrays where the keys are allowed attribute
 * names.
 */
private static function setupAttributesAllowedInternal(): array {
	static $allowed;

	if ( $allowed !== null ) {
		return $allowed;
	}

	// For lookup efficiency attribute lists are flipped so that the
	// attribute names are the keys: $base plus every listed name => true.
	$extend = static function ( $base, $names, $more = [] ) {
		return array_merge(
			$base,
			array_fill_keys( $names, true ),
			array_fill_keys( $more, true ) );
	};
	// Gives every listed element the same attribute set.
	$each = static function ( array $elements, array $attribs ) {
		return array_fill_keys( $elements, $attribs );
	};

	$common = $extend( [], [
		# HTML
		'id', 'class', 'style', 'lang', 'dir', 'title', 'tabindex',

		# WAI-ARIA
		'aria-describedby', 'aria-flowto', 'aria-hidden', 'aria-label',
		'aria-labelledby', 'aria-level', 'aria-owns', 'role',

		# RDFa
		# These attributes are specified in section 9 of
		# https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		'about', 'property', 'resource', 'datatype', 'typeof',

		# Microdata. These are specified by
		# https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
		'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',
	] );

	$block = $extend( $common, [ 'align' ] );

	$tablealign = [ 'align', 'valign' ];
	$tablecell = [
		'abbr', 'axis', 'headers', 'scope', 'rowspan', 'colspan',
		# deprecated:
		'nowrap', 'width', 'height', 'bgcolor',
	];

	// Attribute sets shared by several elements
	$citable = $extend( $common, [ 'cite' ] );
	$revision = $extend( $common, [ 'cite', 'datetime' ] );
	$spanned = $extend( $common, [ 'span' ] );
	$cell = $extend( $common, $tablecell, $tablealign );

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: https://www.w3.org/TR/html4/
	# The pieces are joined with "+", so the element order below is the
	# key order of the resulting table.

	# 7.5.4 ('center' is deprecated)
	$table = [ 'div' => $block, 'center' => $common, 'span' => $common ];
	# 7.5.5
	$table += $each( [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ], $block );
	# 7.5.6 address: not listed
	# 8.2.4, 9.2.1 (acronym: not listed)
	$table += $each(
		[ 'bdo', 'em', 'strong', 'cite', 'dfn', 'code', 'samp', 'kbd', 'var', 'abbr' ],
		$common
	);
	# 9.2.2
	$table += $each( [ 'blockquote', 'q' ], $citable );
	# 9.2.3
	$table += $each( [ 'sub', 'sup' ], $common );
	$table += [
		# 9.3.1
		'p' => $block,
		# 9.3.2
		'br' => $extend( $common, [ 'clear' ] ),
		# https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
		'wbr' => $common,
		# 9.3.4
		'pre' => $extend( $common, [ 'width' ] ),
		# 9.4
		'ins' => $revision,
		'del' => $revision,
		# 10.2
		'ul' => $extend( $common, [ 'type' ] ),
		'ol' => $extend( $common, [ 'type', 'start', 'reversed' ] ),
		'li' => $extend( $common, [ 'type', 'value' ] ),
	];
	# 10.3
	$table += $each( [ 'dl', 'dd', 'dt' ], $common );
	$table += [
		# 11.2.1
		'table' => $extend( $common, [
			'summary', 'width', 'border', 'frame',
			'rules', 'cellspacing', 'cellpadding',
			'align', 'bgcolor',
		] ),
		# 11.2.2
		'caption' => $block,
	];
	# 11.2.3
	$table += $each( [ 'thead', 'tfoot', 'tbody' ], $common );
	# 11.2.4
	$table += $each( [ 'colgroup', 'col' ], $spanned );
	$table += [
		# 11.2.5
		'tr' => $extend( $common, [ 'bgcolor' ], $tablealign ),
		# 11.2.6
		'td' => $cell,
		'th' => $cell,

		# 12.2
		# NOTE: <a> is not allowed directly, but this list of allowed
		# attributes is used from the Parser object
		'a' => $extend( $common, [ 'href', 'rel', 'rev' ] ), # rel/rev esp. for RDFa

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized
		'img' => $extend( $common, [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
		# Attributes for A/V tags added in T163583 / T133673
		'audio' => $extend( $common, [ 'controls', 'preload', 'width', 'height' ] ),
		'video' => $extend( $common, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
		'source' => $extend( $common, [ 'type', 'src' ] ),
		'track' => $extend( $common, [ 'type', 'src', 'srclang', 'kind', 'label' ] ),
	];
	# 15.2.1
	$table += $each( [ 'tt', 'b', 'i', 'big', 'small', 'strike', 's', 'u' ], $common );
	$table += [
		# 15.2.2 (basefont: not listed)
		'font' => $extend( $common, [ 'size', 'color', 'face' ] ),
		# 15.3
		'hr' => $extend( $common, [ 'width' ] ),
	];
	# HTML Ruby annotation text module, simple ruby only.
	# https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
	# (rbc is not listed; 'rt' does not get 'rbspan')
	$table += $each( [ 'ruby', 'rb', 'rp', 'rt', 'rtc' ], $common );
	$table += [
		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# https://www.w3.org/TR/REC-MathML/
		'math' => $extend( [], [ 'class', 'style', 'id', 'title' ] ),

		// HTML 5 section 4.5
		'figure' => $common,
		'figcaption' => $common,

		# HTML 5 section 4.6
		'bdi' => $common,

		# HTML5 elements, defined by:
		# https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
		'data' => $extend( $common, [ 'value' ] ),
		'time' => $extend( $common, [ 'datetime' ] ),
		'mark' => $common,

		// meta and link are only permitted by internalRemoveHtmlTags when Microdata
		// is enabled so we don't bother adding a conditional to hide these
		// Also meta and link are only valid in WikiText as Microdata elements
		// (ie: validateTag rejects tags missing the attributes needed for Microdata)
		// So we don't bother including $common attributes that have no purpose.
		'meta' => $extend( [], [ 'itemprop', 'content' ] ),
		'link' => $extend( [], [ 'itemprop', 'href', 'title' ] ),

		# HTML 5 section 4.3.5
		'aside' => $common,
	];

	$allowed = $table;

	return $allowed;
}
1597
/**
 * Take a fragment of (potentially invalid) HTML and return
 * a version with any tags removed, encoded as plain text.
 *
 * The fragment is tokenized with RemexTokenizer feeding a
 * RemexStripTagHandler; the handler's result is then passed through
 * normalizeWhitespace().
 *
 * Warning: this return value must be further escaped for literal
 * inclusion in HTML output as of 1.10!
 *
 * @param string $html HTML fragment
 * @return string
 * @return-taint tainted
 */
public static function stripAllTags( string $html ): string {
	// Use RemexHtml to tokenize $html and extract the text
	$textCollector = new RemexStripTagHandler;
	$tokenizerOptions = [
		'ignoreErrors' => true,
		// don't ignore char refs, we want them to be decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	];
	( new RemexTokenizer( $textCollector, $html, $tokenizerOptions ) )->execute();

	return self::normalizeWhitespace( $textCollector->getResult() );
}
1624
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations.
 * PHP 4 seemed to know these if you gave it an HTML doctype, but
 * PHP 5.1 doesn't.
 *
 * Use for passing XHTML fragments to PHP's XML parsing functions
 *
 * One <!ENTITY> declaration is produced per semicolon-terminated name
 * in HTMLData::$namedEntityTranslations, except for names that
 * normalizeEntity() leaves in word form.
 *
 * @deprecated since 1.36; will be made private or removed in a future
 * release.
 */
public static function hackDocType(): string {
	$out = "<!DOCTYPE html [\n";
	foreach ( HTMLData::$namedEntityTranslations as $entity => $translation ) {
		if ( substr( $entity, -1 ) !== ';' ) {
			// Some HTML entities omit the trailing semicolon;
			// wikitext does not permit these.
			continue;
		}
		$name = substr( $entity, 0, -1 );
		$expansion = self::normalizeEntity( $entity );
		// normalizeEntity() returns "&name;" for entities it keeps in
		// word form, so compare against the name with its ampersand.
		// Declaring those would define an entity in terms of itself.
		if ( "&$entity" === $expansion ) {
			// Skip &lt; &gt; etc
			continue;
		}
		$out .= "<!ENTITY $name \"$expansion\">";
	}
	$out .= "]>\n";
	return $out;
}
1654
/**
 * Clean up a URL: decode character references, percent-encode runs of
 * brackets, angle brackets, double quotes, pipes and bytes 0x00-0x20
 * and 0x7F, then strip ignorable characters from the host part.
 *
 * If the URL does not have the shape "scheme:[//host][rest]", it is
 * returned after the first two steps only. A percent-encoded bracketed
 * IPv6 host (with optional port) has its brackets restored.
 *
 * @param string $url
 * @return string
 */
public static function cleanUrl( string $url ): string {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = self::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step
	$url = preg_replace_callback(
		'/[\][<>"\\x00-\\x20\\x7F\|]+/',
		static fn ( $m ) => urlencode( $m[0] ),
		$url
	);

	# Validate hostname portion
	$parts = [];
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $parts ) ) {
		return $url;
	}
	[ /* $whole */, $protocol, $host, $rest ] = $parts;

	// Characters that will be ignored in IDNs.
	// https://datatracker.ietf.org/doc/html/rfc8264#section-9.13
	// https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
	// Strip them before further processing so deny lists and such work.
	$strip = "/
		\\s| # general whitespace
		\u{00AD}| # SOFT HYPHEN
		\u{034F}| # COMBINING GRAPHEME JOINER
		\u{061C}| # ARABIC LETTER MARK
		[\u{115F}-\u{1160}]| # HANGUL CHOSEONG FILLER..
			# HANGUL JUNGSEONG FILLER
		[\u{17B4}-\u{17B5}]| # KHMER VOWEL INHERENT AQ..
			# KHMER VOWEL INHERENT AA
		[\u{180B}-\u{180D}]| # MONGOLIAN FREE VARIATION SELECTOR ONE..
			# MONGOLIAN FREE VARIATION SELECTOR THREE
		\u{180E}| # MONGOLIAN VOWEL SEPARATOR
		[\u{200B}-\u{200F}]| # ZERO WIDTH SPACE..
			# RIGHT-TO-LEFT MARK
		[\u{202A}-\u{202E}]| # LEFT-TO-RIGHT EMBEDDING..
			# RIGHT-TO-LEFT OVERRIDE
		[\u{2060}-\u{2064}]| # WORD JOINER..
			# INVISIBLE PLUS
		\u{2065}| # <reserved-2065>
		[\u{2066}-\u{206F}]| # LEFT-TO-RIGHT ISOLATE..
			# NOMINAL DIGIT SHAPES
		\u{3164}| # HANGUL FILLER
		[\u{FE00}-\u{FE0F}]| # VARIATION SELECTOR-1..
			# VARIATION SELECTOR-16
		\u{FEFF}| # ZERO WIDTH NO-BREAK SPACE
		\u{FFA0}| # HALFWIDTH HANGUL FILLER
		[\u{FFF0}-\u{FFF8}]| # <reserved-FFF0>..
			# <reserved-FFF8>
		[\u{1BCA0}-\u{1BCA3}]| # SHORTHAND FORMAT LETTER OVERLAP..
			# SHORTHAND FORMAT UP STEP
		[\u{1D173}-\u{1D17A}]| # MUSICAL SYMBOL BEGIN BEAM..
			# MUSICAL SYMBOL END PHRASE
		\u{E0000}| # <reserved-E0000>
		\u{E0001}| # LANGUAGE TAG
		[\u{E0002}-\u{E001F}]| # <reserved-E0002>..
			# <reserved-E001F>
		[\u{E0020}-\u{E007F}]| # TAG SPACE..
			# CANCEL TAG
		[\u{E0080}-\u{E00FF}]| # <reserved-E0080>..
			# <reserved-E00FF>
		[\u{E0100}-\u{E01EF}]| # VARIATION SELECTOR-17..
			# VARIATION SELECTOR-256
		[\u{E01F0}-\u{E0FFF}]| # <reserved-E01F0>..
			# <reserved-E0FFF>
		/xuD";

	$host = preg_replace( $strip, '', $host );

	// IPv6 host names are bracketed with []. Url-decode these.
	$ipv6 = [];
	if ( str_starts_with( $host, "//%5B" ) &&
		preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $ipv6 )
	) {
		$host = '//[' . $ipv6[1] . ']' . $ipv6[2];
	}

	// @todo FIXME: Validate hostnames here

	return $protocol . $host . $rest;
}
1735
/**
 * Does a string look like an e-mail address?
 *
 * This validates an email address using an HTML5 specification found at:
 * http://www.whatwg.org/html/states-of-the-type-attribute.html#valid-e-mail-address
 * Which as of 2011-01-24 says:
 *
 *   A valid e-mail address is a string that matches the ABNF production
 *   1*( atext / "." ) "@" ldh-str *( "." ldh-str ) where atext is defined
 *   in RFC 5322 section 3.2.3, and ldh-str is defined in RFC 1034 section
 *   3.5.
 *
 * This function is an implementation of the specification as requested in
 * T24449.
 *
 * Client-side forms will use the same standard validation rules via JS or
 * HTML 5 validation; additional restrictions can be enforced server-side
 * by extensions via the 'isValidEmailAddr' hook.
 *
 * Note that this validation doesn't 100% match RFC 2822, but is believed
 * to be liberal enough for wide use. Some invalid addresses will still
 * pass validation here.
 *
 * @since 1.18
 *
 * @param string $addr E-mail address
 * @return bool True if the hook (when it aborts) or the built-in pattern
 *   accepts the address, false otherwise
 */
public static function validateEmail( string $addr ): bool {
	$result = null;
	// TODO This method should be non-static, and have a HookRunner injected
	$hookRunner = new HookRunner( MediaWikiServices::getInstance()->getHookContainer() );
	if ( !$hookRunner->onIsValidEmailAddr( $addr, $result ) ) {
		// The hook took over the decision. Cast so that a handler which
		// aborts without assigning $result yields false rather than a
		// TypeError from returning null out of a bool-typed method.
		return (bool)$result;
	}

	// Please note the strings below are placed inside brackets [], which
	// makes the hyphen "-" a range indicator. Hence it is double
	// backslashed below.
	// See T28948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	// The D modifier makes the final "$" match only at the very end of the
	// subject; without it an address followed by a newline would pass.
	$html5_email_regexp = "/
		^                         # start of string
		[$rfc5322_atext\\.]+      # user part which is liberal :p
		@                         # at sign
		[$rfc1034_ldh_str]+       # First domain part
		(\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
		$                         # End of string
		/ixD"; // case Insensitive, eXtended, Dollar-end-only

	return (bool)preg_match( $html5_email_regexp, $addr );
}
1789	}
1790
/**
 * Backward-compatibility alias: makes the class referenced by
 * Sanitizer::class also reachable under the un-namespaced name 'Sanitizer'.
 *
 * @deprecated class alias since 1.41
 */
class_alias( Sanitizer::class, 'Sanitizer' );