29use InvalidArgumentException;
35use UnexpectedValueException;
36use Wikimedia\RemexHtml\HTMLData;
37use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer;
38use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer;
39use Wikimedia\RemexHtml\TreeBuilder\Dispatcher as RemexDispatcher;
40use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder as RemexTreeBuilder;
53 private const CHAR_REFS_REGEX =
54 '/&([A-Za-z0-9\x80-\xff]+;)
56 |&\#[xX]([0-9A-Fa-f]+);
63 private const ELEMENT_BITS_REGEX =
'!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
74 private const EVIL_URI_PATTERN =
'!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
75 private const XMLNS_ATTRIBUTE_PATTERN =
"/^xmlns:[:A-Z_a-z-.0-9]+$/";
96 private const MW_ENTITY_ALIASES = [
104 private static $attribsRegex;
112 private static function getAttribsRegex() {
113 if ( self::$attribsRegex ===
null ) {
114 $spaceChars =
'\x09\x0a\x0c\x0d\x20';
115 $space =
"[{$spaceChars}]";
116 $attrib =
"[^{$spaceChars}\/>=]";
117 $attribFirst =
"(?:{$attrib}|=)";
118 self::$attribsRegex =
119 "/({$attribFirst}{$attrib}*)
122 # The attribute value: quoted or alone
129 return self::$attribsRegex;
135 private static $attribNameRegex;
/**
 * Return the anchored pattern that a whole attribute name must match.
 *
 * The pattern is assembled on the first call and cached in
 * self::$attribNameRegex for every later call.
 *
 * @return string PCRE pattern, including delimiters and flags
 */
private static function getAttribNameRegex() {
	if ( self::$attribNameRegex !== null ) {
		return self::$attribNameRegex;
	}

	// Leading character: colon, underscore, or any Unicode letter or number.
	$leadClass = "[:_\p{L}\p{N}]";
	// Following characters may additionally be a dot or a hyphen.
	$tailClass = "[:_\.\-\p{L}\p{N}]";

	self::$attribNameRegex = "/^({$leadClass}{$tailClass}*)$/sxu";

	return self::$attribNameRegex;
}
159 static $commonCase, $staticInitialised;
160 $isCommonCase = ( $extratags === [] && $removetags === [] );
161 if ( $staticInitialised ===
$wgAllowImageTag && $isCommonCase && $commonCase ) {
165 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
166 $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic;
171 if ( !$staticInitialised || $staticInitialised !== $globalContext ) {
172 $htmlpairsStatic = [ # Tags that must be closed
173 'b',
'bdi',
'del',
'i',
'ins',
'u',
'font',
'big',
'small',
'sub',
'sup',
'h1',
174 'h2',
'h3',
'h4',
'h5',
'h6',
'cite',
'code',
'em',
's',
175 'strike',
'strong',
'tt',
'var',
'div',
'center',
176 'blockquote',
'ol',
'ul',
'dl',
'table',
'caption',
'pre',
177 'ruby',
'rb',
'rp',
'rt',
'rtc',
'p',
'span',
'abbr',
'dfn',
178 'kbd',
'samp',
'data',
'time',
'mark'
180 # These tags can be self-closed. For tags not also on
181 # $htmlsingleonly, a self-closed tag will be emitted as
182 # an empty element (open-tag/close-tag pair).
184 'br',
'wbr',
'hr',
'li',
'dt',
'dd',
'meta',
'link'
187 # Elements that cannot have close tags. This is (not coincidentally)
188 # also the list of tags for which the HTML 5 parsing algorithm
189 # requires you to "acknowledge the token's self-closing flag", i.e.
190 # a self-closing tag like <br/> is not an HTML 5 parse error only
193 'br',
'wbr',
'hr',
'meta',
'link'
196 $htmlnest = [ # Tags that can be nested--??
197 'table',
'tr',
'td',
'th',
'div',
'blockquote',
'ol',
'ul',
198 'li',
'dl',
'dt',
'dd',
'font',
'big',
'small',
'sub',
'sup',
'span',
199 'var',
'kbd',
'samp',
'em',
'strong',
'q',
'ruby',
'bdo'
201 $tabletags = [ # Can only appear inside table, we will close them
204 $htmllist = [ # Tags used by list
207 $listtags = [ # Tags that can appear in a list
213 'is deprecated since MediaWiki 1.35',
'1.35',
false,
false );
214 $htmlsingle[] =
'img';
215 $htmlsingleonly[] =
'img';
218 $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
219 $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
221 # Convert them all to hashtables for faster lookup
222 $vars = [
'htmlpairsStatic',
'htmlsingle',
'htmlsingleonly',
'htmlnest',
'tabletags',
223 'htmllist',
'listtags',
'htmlsingleallowed',
'htmlelementsStatic' ];
224 foreach ( $vars as $var ) {
225 $$var = array_fill_keys( $$var,
true );
227 $staticInitialised = $globalContext;
230 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
231 $extratags = array_fill_keys( $extratags,
true );
232 $removetags = array_fill_keys( $removetags,
true );
234 $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
236 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
239 'htmlpairs' => $htmlpairs,
240 'htmlsingle' => $htmlsingle,
241 'htmlsingleonly' => $htmlsingleonly,
242 'htmlnest' => $htmlnest,
243 'tabletags' => $tabletags,
244 'htmllist' => $htmllist,
245 'listtags' => $listtags,
246 'htmlsingleallowed' => $htmlsingleallowed,
247 'htmlelements' => $htmlelements,
249 if ( $isCommonCase ) {
250 $commonCase = $result;
286 $args = [], $extratags = [], $removetags = []
290 $text, $processCallback, $args, $extratags, $removetags
323 $args = [], $extratags = [], $removetags = []
326 $htmlsingle = $tagData[
'htmlsingle'];
327 $htmlsingleonly = $tagData[
'htmlsingleonly'];
328 $htmlelements = $tagData[
'htmlelements'];
330 # Remove HTML comments
332 $bits = explode(
'<', $text );
333 $text = str_replace(
'>',
'>', array_shift( $bits ) );
335 # this might be possible using remex tidy itself
336 foreach ( $bits as $x ) {
337 if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
338 [ , $slash, $t, $params, $brace, $rest ] = $regs;
341 $t = strtolower( $t );
342 if ( isset( $htmlelements[$t] ) ) {
343 if ( is_callable( $processCallback ) ) {
344 call_user_func_array( $processCallback, [ &$params, $args ] );
347 if ( $brace ==
'/>' && !( isset( $htmlsingle[$t] ) || isset( $htmlsingleonly[$t] ) ) ) {
352 if ( !self::validateTag( $params, $t ) ) {
358 if ( $brace ===
'/>' && !isset( $htmlsingleonly[$t] ) ) {
359 # Interpret self-closing tags as empty tags even when
360 # HTML 5 would interpret them as start tags. Such input
361 # is commonly seen on Wikimedia wikis with this intention.
365 $rest = str_replace(
'>',
'>', $rest );
366 $text .=
"<$slash$t$newparams$brace$rest";
371 $text .=
'<' . str_replace(
'>',
'>', $x );
398 string $text, array $options = []
400 $extraTags = $options[
'extraTags'] ?? [];
401 $removeTags = $options[
'removeTags'] ?? [];
403 $attrCallback = $options[
'attrCallback'] ??
null;
404 $attrCallbackArgs = $options[
'attrCallbackArgs'] ?? [];
413 $serializer =
new RemexSerializer( $formatter );
414 $treeBuilder =
new RemexTreeBuilder( $serializer, [
415 'ignoreErrors' =>
true,
416 'ignoreNulls' =>
true,
418 $dispatcher =
new RemexDispatcher( $treeBuilder );
419 $tokenHandler = $dispatcher;
421 $tokenHandler, $text, $tagData,
422 $attrCallback, $attrCallbackArgs
424 $tokenizer =
new RemexTokenizer( $remover, $text, [
425 'ignoreErrors' =>
true,
427 'ignoreNulls' =>
true,
428 'skipPreprocess' =>
true,
430 $tokenizer->execute( [
431 'fragmentNamespace' => HTMLData::NS_HTML,
432 'fragmentName' =>
'body',
434 return $serializer->getResult();
447 while ( ( $start = strpos( $text,
'<!--' ) ) !==
false ) {
448 $end = strpos( $text,
'-->', $start + 4 );
449 if ( $end ===
false ) {
450 # Unterminated comment; bail out
456 # Trim space and newline if the comment is both
457 # preceded and followed by a newline
458 $spaceStart = max( $start - 1, 0 );
459 $spaceLen = $end - $spaceStart;
460 while ( substr( $text, $spaceStart, 1 ) ===
' ' && $spaceStart > 0 ) {
464 while ( substr( $text, $spaceStart + $spaceLen, 1 ) ===
' ' ) {
467 if ( substr( $text, $spaceStart, 1 ) ===
"\n"
468 && substr( $text, $spaceStart + $spaceLen, 1 ) ===
"\n" ) {
469 # Remove the comment, leading and trailing
470 # spaces, and leave only one newline.
471 $text = substr_replace( $text,
"\n", $spaceStart, $spaceLen + 1 );
473 # Remove just the comment.
474 $text = substr_replace( $text,
'', $start, $end - $start );
494 private static function validateTag( $params, $element ) {
495 $params = self::decodeTagAttributes( $params );
497 if ( $element ==
'meta' || $element ==
'link' ) {
498 if ( !isset( $params[
'itemprop'] ) ) {
502 if ( $element ==
'meta' && !isset( $params[
'content'] ) ) {
506 if ( $element ==
'link' && !isset( $params[
'href'] ) ) {
531 return self::validateAttributes( $attribs,
532 self::attributesAllowedInternal( $element ) );
554 if ( isset( $allowed[0] ) ) {
557 wfDeprecated( __METHOD__ .
' with sequential array',
'1.35' );
558 $allowed = array_fill_keys( $allowed,
true );
563 foreach ( $attribs as $attribute => $value ) {
564 # Allow XML namespace declaration to allow RDFa
565 if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
566 if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
567 $out[$attribute] = $value;
573 # Allow any attribute beginning with "data-"
575 # * Disallow data attributes used by MediaWiki code
576 # * Ensure that the attribute is not namespaced by banning
579 !preg_match(
'/^data-[^:]*$/i', $attribute ) &&
580 !array_key_exists( $attribute, $allowed )
581 ) || self::isReservedDataAttribute( $attribute ) ) {
585 # Strip javascript "expression" from stylesheets.
587 if ( $attribute ==
'style' ) {
588 $value = self::checkCss( $value );
591 # Escape HTML id attributes
592 if ( $attribute ===
'id' ) {
593 $value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
596 # Escape HTML id reference lists
597 if ( $attribute ===
'aria-describedby'
598 || $attribute ===
'aria-flowto'
599 || $attribute ===
'aria-labelledby'
600 || $attribute ===
'aria-owns'
602 $value = self::escapeIdReferenceListInternal( $value );
606 if ( $attribute ===
'rel' || $attribute ===
'rev'
608 || $attribute ===
'about' || $attribute ===
'property'
609 || $attribute ===
'resource' || $attribute ===
'datatype'
610 || $attribute ===
'typeof'
612 || $attribute ===
'itemid' || $attribute ===
'itemprop'
613 || $attribute ===
'itemref' || $attribute ===
'itemscope'
614 || $attribute ===
'itemtype'
617 if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
622 # NOTE: even though elements using href/src are not allowed directly, supply
623 # validation code that can be used by tag hook handlers, etc
624 if ( $attribute ===
'href' || $attribute ===
'src' || $attribute ===
'poster' ) {
625 if ( !preg_match( $hrefExp, $value ) ) {
631 if ( $attribute ===
'tabindex' && $value !==
'0' ) {
638 $out[$attribute] = $value;
641 # itemtype, itemid, itemref don't make sense without itemscope
642 if ( !array_key_exists(
'itemscope', $out ) ) {
643 unset( $out[
'itemtype'] );
644 unset( $out[
'itemid'] );
645 unset( $out[
'itemref'] );
647 # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
667 return (
bool)preg_match(
'/^data-(ooui|mw|parsoid)/i', $attr );
681 $out = array_merge( $a, $b );
682 if ( isset( $a[
'class'] ) && isset( $b[
'class'] )
683 && is_string( $a[
'class'] ) && is_string( $b[
'class'] )
684 && $a[
'class'] !== $b[
'class']
686 $classes = preg_split(
'/\s+/',
"{$a['class']} {$b['class']}",
687 -1, PREG_SPLIT_NO_EMPTY );
688 $out[
'class'] = implode(
' ', array_unique( $classes ) );
703 $value = self::decodeCharReferences( $value );
715 if ( !$decodeRegex ) {
716 $space =
'[\\x20\\t\\r\\n\\f]';
717 $nl =
'(?:\\n|\\r\\n|\\r|\\f)';
719 $decodeRegex =
"/ $backslash
721 ($nl) | # 1. Line continuation
722 ([0-9A-Fa-f]{1,6})$space? | # 2. character number
723 (.) | # 3. backslash cancelling special meaning
724 () | # 4. backslash at end of string
727 $value = preg_replace_callback( $decodeRegex,
728 [ __CLASS__,
'cssDecodeCallback' ], $value );
733 if ( !preg_match(
'! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
740 $value = StringUtils::delimiterReplace(
'/*',
'*/',
' ', $value );
744 $commentPos = strpos( $value,
'/*' );
745 if ( $commentPos !==
false ) {
746 $value = substr( $value, 0, $commentPos );
772 $value = self::normalizeCss( $value );
775 if ( preg_match(
'/[\000-\010\013\016-\037\177]/', $value ) ||
776 strpos( $value, \UtfNormal\Constants::UTF8_REPLACEMENT ) !==
false ) {
777 return '/* invalid control char */';
778 } elseif ( preg_match(
788 | attr\s*\([^)]+[\s,]+url
790 return '/* insecure input */';
799 private static function cssDecodeCallback(
$matches ) {
804 # hexdec could return a float if the match is too long, but the
805 # regexp in question limits the string length to 6.
806 $char = \UtfNormal\Utils::codepointToUtf8( hexdec(
$matches[2] ) );
812 if ( $char ==
"\n" || $char ==
'"' || $char ==
"'" || $char ==
'\\' ) {
815 return '\\' . dechex( ord( $char ) ) .
' ';
844 if ( trim( $text ) ==
'' ) {
848 $decoded = self::decodeTagAttributes( $text );
849 $stripped = self::validateTagAttributes( $decoded, $element );
855 return self::safeEncodeTagAttributes( $stripped );
866 $encValue = htmlspecialchars( $text, ENT_QUOTES );
871 $encValue = strtr( $encValue, [
892 # French spaces, last one Guillemet-left
893 # only if it isn't followed by a word character.
894 '/ (?=[?:;!%»›](?!\w))/u' =>
"$space",
895 # French spaces, Guillemet-right
896 '/([«‹]) /u' =>
"\\1$space",
898 return preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
910 $encValue = self::encodeAttribute( $text );
912 # Templates and links may be expanded in later parsing,
913 # creating invalid or dangerous output. Suppress this.
914 $encValue = strtr( $encValue, [
924 "''" =>
'''',
925 'ISBN' =>
'ISBN',
927 'PMID' =>
'PMID',
933 $encValue = preg_replace_callback(
936 return str_replace(
':',
':',
$matches[1] );
961 if ( $mode === self::ID_PRIMARY ) {
962 throw new UnexpectedValueException(
'$wgFragmentMode is configured with no primary mode' );
969 return self::escapeIdInternal( $id, $internalMode );
988 throw new UnexpectedValueException(
'$wgFragmentMode is configured with no primary mode' );
993 $id = self::escapeIdInternalUrl( $id, $mode );
1024 private static function escapeIdInternalUrl( $id, $mode ) {
1025 $id = self::escapeIdInternal( $id, $mode );
1026 if ( $mode ===
'html5' ) {
1027 $id = preg_replace(
'/%([a-fA-F0-9]{2})/',
'%25$1', $id );
1039 private static function escapeIdInternal( $id, $mode ) {
1042 $id = mb_substr( $id, 0, 1024 );
1050 $id = str_replace( [
"\t",
"\n",
"\f",
"\r",
" " ],
'_', $id );
1059 $id = urlencode( str_replace(
' ',
'_', $id ) );
1060 $id = strtr( $id, $replace );
1063 throw new InvalidArgumentException(
"Invalid mode '$mode' passed to '" . __METHOD__ );
/**
 * Escape every id in a whitespace-separated list of id references.
 *
 * Each token is passed through self::escapeIdForAttribute() and the
 * escaped tokens are joined back together with single spaces.
 *
 * @param string $referenceString Whitespace-delimited list of ids
 * @return string Space-delimited list of escaped ids; '' when there are no tokens
 */
private static function escapeIdReferenceListInternal( $referenceString ) {
	# Explode the space delimited list string into an array of tokens.
	# preg_split() returns false on a PCRE failure; treat that as "no tokens"
	# instead of handing false to the loop and to implode().
	$references = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY ) ?: [];

	# Escape each token as an id. Mapping (rather than iterating by
	# reference) leaves no live reference to the last element behind.
	$escaped = array_map(
		static fn ( $ref ) => self::escapeIdForAttribute( $ref ),
		$references
	);

	# Merge the array back to a space delimited list string
	# If the array is empty, the result will be an empty string ('')
	return implode( ' ', $escaped );
}
1105 return rtrim( preg_replace(
1106 [
'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
'/_+/' ],
1121 $html = self::decodeCharReferences( $html );
1122 # It seems wise to escape ' as well as ", as a matter of course. Can't
1123 # hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
1124 # don't cause the entire string to disappear.
1125 $html = htmlspecialchars( $html, ENT_QUOTES | ENT_SUBSTITUTE );
1137 public static function decodeTagAttributes( $text ) {
1138 if ( trim( $text ) == '' ) {
1143 if ( !preg_match_all(
1144 self::getAttribsRegex(),
1147 PREG_SET_ORDER ) ) {
1152 foreach ( $pairs as $set ) {
1153 $attribute = strtolower( $set[1] );
1155 // Filter attribute names with unacceptable characters
1156 if ( !preg_match( self::getAttribNameRegex(), $attribute ) ) {
1160 $value = self::getTagAttributeCallback( $set );
1162 // Normalize whitespace
1163 $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
1164 $value = trim( $value );
1166 // Decode character references
1167 $attribs[$attribute] = self::decodeCharReferences( $value );
1179 public static function safeEncodeTagAttributes( $assoc_array ) {
1181 foreach ( $assoc_array as $attribute => $value ) {
1182 $encAttribute = htmlspecialchars( $attribute, ENT_COMPAT );
1183 $encValue = self::safeEncodeAttribute( $value );
1185 $attribs[] = "$encAttribute=\"$encValue\"";
1187 return count( $attribs ) ?
' ' . implode(
' ', $attribs ) :
'';
1197 private static function getTagAttributeCallback( $set ) {
1198 if ( isset( $set[5] ) ) {
1201 } elseif ( isset( $set[4] ) ) {
1204 } elseif ( isset( $set[3] ) ) {
1207 } elseif ( !isset( $set[2] ) ) {
1208 # In XHTML, attributes must have a value so return an empty string.
1209 # See "Empty attribute syntax",
1213 throw new LogicException(
"Tag conditions not met. This should never happen and is a bug." );
1221 private static function normalizeWhitespace( $text ) {
1222 return trim( preg_replace(
1223 '/(?:\r\n|[\x20\x0d\x0a\x09])+/',
1237 return trim( preg_replace(
'/[ _]+/',
' ', $section ) );
1256 return preg_replace_callback(
1257 self::CHAR_REFS_REGEX,
1258 [ self::class,
'normalizeCharReferencesCallback' ],
1266 private static function normalizeCharReferencesCallback(
$matches ) {
1269 $ret = self::normalizeEntity(
$matches[1] );
1271 $ret = self::decCharReference(
$matches[2] );
1273 $ret = self::hexCharReference(
$matches[3] );
1275 if ( $ret ===
null ) {
1276 return htmlspecialchars(
$matches[0], ENT_COMPAT );
1292 private static function normalizeEntity( $name ) {
1293 if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
1295 return '&' . self::MW_ENTITY_ALIASES[$name];
1296 } elseif ( in_array( $name, [
'lt;',
'gt;',
'amp;',
'quot;' ],
true ) ) {
1299 } elseif ( isset( HTMLData::$namedEntityTranslations[$name] ) ) {
1301 return preg_replace_callback(
'/./Ssu',
static function ( $m ) {
1302 return '&#' . \UtfNormal\Utils::utf8ToCodepoint( $m[0] ) .
';';
1303 }, HTMLData::$namedEntityTranslations[$name] );
1305 return "&$name";
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint The digits of the reference
 * @return string|null The reference re-emitted as "&#N;" when
 *   self::validateCodepoint() accepts the value, otherwise null
 */
private static function decCharReference( $codepoint ) {
	# intval() will (safely) saturate at the maximum signed integer
	# value if $codepoint is too many digits
	$point = intval( $codepoint );

	# NOTE(review): the lines following the sprintf() return are not visible
	# in this extract; null is what normalizeCharReferencesCallback() tests
	# for, and is also what falling off the end would yield -- confirm.
	return self::validateCodepoint( $point )
		? sprintf( '&#%d;', $point )
		: null;
}
1328 private static function hexCharReference( $codepoint ) {
1329 # hexdec() will return a float (not an int) if $codepoint is too
1330 # long, so protect against that. The largest valid codepoint is
1332 if ( strlen( ltrim( $codepoint,
'0' ) ) > 6 ) {
1335 $point = hexdec( $codepoint );
1336 if ( self::validateCodepoint( $point ) ) {
1337 return sprintf(
'&#x%x;', $point );
/**
 * Decide whether a numeric code point is accepted by the sanitizer.
 *
 * Accepted values are tab, line feed, and the inclusive ranges listed
 * in the body; everything else is rejected.
 *
 * @param int|float $codepoint Callers pass intval() and hexdec() results
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	# U+007F - U+009F are disallowed in HTML5 (control characters).
	if ( $codepoint == 0x09 || $codepoint == 0x0a ) {
		return true;
	}

	// Inclusive [low, high] ranges; the gaps are the rejected values.
	$acceptedRanges = [
		[ 0x20, 0x7e ],
		[ 0xa0, 0xd7ff ],
		[ 0xe000, 0xfffd ],
		[ 0x10000, 0x10ffff ],
	];
	foreach ( $acceptedRanges as [ $low, $high ] ) {
		if ( $codepoint >= $low && $codepoint <= $high ) {
			return true;
		}
	}

	return false;
}
1369 return preg_replace_callback(
1370 self::CHAR_REFS_REGEX,
1371 [ self::class,
'decodeCharReferencesCallback' ],
1386 $text = preg_replace_callback(
1387 self::CHAR_REFS_REGEX,
1388 [ self::class,
'decodeCharReferencesCallback' ],
1395 return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $text );
1405 private static function decodeCharReferencesCallback(
$matches ) {
1407 return self::decodeEntity(
$matches[1] );
1409 return self::decodeChar( intval(
$matches[2] ) );
1411 # hexdec will return a float if the string is too long (!) so
1412 # check the length of the string first.
1413 if ( strlen( ltrim(
$matches[3],
'0' ) ) > 6 ) {
1415 return \UtfNormal\Constants::UTF8_REPLACEMENT;
1417 return self::decodeChar( hexdec(
$matches[3] ) );
1419 # Last case should be an ampersand by itself
/**
 * Turn a numeric code point into its UTF-8 encoding.
 *
 * @param int|float $codepoint
 * @return string The UTF-8 bytes when self::validateCodepoint() accepts
 *   the value, otherwise \UtfNormal\Constants::UTF8_REPLACEMENT
 */
private static function decodeChar( $codepoint ) {
	return self::validateCodepoint( $codepoint )
		? \UtfNormal\Utils::codepointToUtf8( $codepoint )
		: \UtfNormal\Constants::UTF8_REPLACEMENT;
}
/**
 * Expand a named entity to the text it stands for.
 *
 * @param string $name Key as used in HTMLData::$namedEntityTranslations
 * @return string The translation when one exists; otherwise "&" followed
 *   by the (alias-resolved) name, i.e. the reference is left as text
 */
private static function decodeEntity( $name ) {
	// Swap a name listed in MW_ENTITY_ALIASES for the name it maps to.
	$name = self::MW_ENTITY_ALIASES[$name] ?? $name;

	return HTMLData::$namedEntityTranslations[$name] ?? "&$name";
}
/**
 * Look up the attributes permitted on one element.
 *
 * @param string $element Element name used as the lookup key
 * @return array The entry from self::setupAttributesAllowedInternal()
 *   for $element, or an empty array when there is none
 */
private static function attributesAllowedInternal( $element ) {
	$allowedByElement = self::setupAttributesAllowedInternal();

	if ( isset( $allowedByElement[$element] ) ) {
		return $allowedByElement[$element];
	}

	return [];
}
1474 private static function setupAttributesAllowedInternal() {
1477 if ( $allowed !==
null ) {
1483 $merge =
static function ( $a, $b, $c = [] ) {
1486 array_fill_keys( $b,
true ),
1487 array_fill_keys( $c,
true ) );
1489 $common = $merge( [], [
1510 # These attributes are specified in section 9 of
1518 # Microdata. These are specified by
1527 $block = $merge( $common, [
'align' ] );
1529 $tablealign = [
'align',
'valign' ];
1537 'nowrap', # deprecated
1538 'width', # deprecated
1539 'height', # deprecated
1540 'bgcolor', # deprecated
1543 # Numbers refer to sections in HTML 4.01 standard describing the element.
1548 'center' => $common, # deprecated
1567 'strong' => $common,
1578 'blockquote' => $merge( $common, [
'cite' ] ),
1579 'q' => $merge( $common, [
'cite' ] ),
1589 'br' => $merge( $common, [
'clear' ] ),
1595 'pre' => $merge( $common, [
'width' ] ),
1598 'ins' => $merge( $common, [
'cite',
'datetime' ] ),
1599 'del' => $merge( $common, [
'cite',
'datetime' ] ),
1602 'ul' => $merge( $common, [
'type' ] ),
1603 'ol' => $merge( $common, [
'type',
'start',
'reversed' ] ),
1604 'li' => $merge( $common, [
'type',
'value' ] ),
1612 'table' => $merge( $common,
1613 [
'summary',
'width',
'border',
'frame',
1614 'rules',
'cellspacing',
'cellpadding',
1619 'caption' => $block,
1627 'colgroup' => $merge( $common, [
'span' ] ),
1628 'col' => $merge( $common, [
'span' ] ),
1631 'tr' => $merge( $common, [
'bgcolor' ], $tablealign ),
1634 'td' => $merge( $common, $tablecell, $tablealign ),
1635 'th' => $merge( $common, $tablecell, $tablealign ),
1638 # NOTE: <a> is not allowed directly, but this list of allowed
1639 # attributes is used from the Parser object
1640 'a' => $merge( $common, [
'href',
'rel',
'rev' ] ), # rel/rev esp.
for RDFa
1643 # Not usually allowed, but may be used for extension-style hooks
1644 # such as <math> when it is rasterized, or if $wgAllowImageTag is
1646 'img' => $merge( $common, [
'alt',
'src',
'width',
'height',
'srcset' ] ),
1647 # Attributes for A/V tags added in T163583 / T133673
1648 'audio' => $merge( $common, [
'controls',
'preload',
'width',
'height' ] ),
1649 'video' => $merge( $common, [
'poster',
'controls',
'preload',
'width',
'height' ] ),
1650 'source' => $merge( $common, [
'type',
'src' ] ),
1651 'track' => $merge( $common, [
'type',
'src',
'srclang',
'kind',
'label' ] ),
1659 'strike' => $common,
1664 'font' => $merge( $common, [
'size',
'color',
'face' ] ),
1668 'hr' => $merge( $common, [
'width' ] ),
1670 # HTML Ruby annotation text module, simple ruby only.
1676 'rt' => $common, # $merge( $common, [
'rbspan' ] ),
1679 # MathML root element, where used for extensions
1680 # 'title' may not be 100% valid here; it's XHTML
1682 'math' => $merge( [], [
'class',
'style',
'id',
'title' ] ),
1685 'figure' => $common,
1686 'figcaption' => $common,
1688 # HTML 5 section 4.6
1691 # HTML5 elements, defined by:
1693 'data' => $merge( $common, [
'value' ] ),
1694 'time' => $merge( $common, [
'datetime' ] ),
1702 'meta' => $merge( [], [
'itemprop',
'content' ] ),
1703 'link' => $merge( [], [
'itemprop',
'href',
'title' ] ),
1705 # HTML 5 section 4.3.5
1726 $tokenizer =
new RemexTokenizer( $handler, $html, [
1727 'ignoreErrors' =>
true,
1729 'ignoreNulls' =>
true,
1730 'skipPreprocess' =>
true,
1732 $tokenizer->execute();
1733 $text = $handler->getResult();
1735 $text = self::normalizeWhitespace( $text );
1751 $out =
"<!DOCTYPE html [\n";
1752 foreach ( HTMLData::$namedEntityTranslations as $entity => $translation ) {
1753 if ( substr( $entity, -1 ) !==
';' ) {
1758 $name = substr( $entity, 0, -1 );
1759 $expansion = self::normalizeEntity( $entity );
1760 if ( $entity === $expansion ) {
1764 $out .=
"<!ENTITY $name \"$expansion\">";
1775 # Normalize any HTML entities in input. They will be
1776 # re-escaped by makeExternalLink().
1777 $url = self::decodeCharReferences( $url );
1779 # Escape any control characters introduced by the above step
1780 $url = preg_replace_callback(
'/[\][<>"\\x00-\\x20\\x7F\|]+/',
1781 static fn ( $m ) => urlencode( $m[0] ), $url );
1783 # Validate hostname portion
1785 if ( preg_match(
'!^([^:]+:)(//[^/]+)?(.*)$!iD', $url,
$matches ) ) {
1786 [ , $protocol, $host, $rest ] =
$matches;
1793 \\s| # general whitespace
1794 \u{00AD}| # SOFT HYPHEN
1795 \u{034F}| # COMBINING GRAPHEME JOINER
1796 \u{061C}| # ARABIC LETTER MARK
1797 [\u{115F}-\u{1160}]| # HANGUL CHOSEONG FILLER..
1798 # HANGUL JUNGSEONG FILLER
1799 [\u{17B4}-\u{17B5}]| # KHMER VOWEL INHERENT AQ..
1800 # KHMER VOWEL INHERENT AA
1801 [\u{180B}-\u{180D}]| # MONGOLIAN FREE VARIATION SELECTOR ONE..
1802 # MONGOLIAN FREE VARIATION SELECTOR THREE
1803 \u{180E}| # MONGOLIAN VOWEL SEPARATOR
1804 [\u{200B}-\u{200F}]| # ZERO WIDTH SPACE..
1805 # RIGHT-TO-LEFT MARK
1806 [\u{202A}-\u{202E}]| # LEFT-TO-RIGHT EMBEDDING..
1807 # RIGHT-TO-LEFT OVERRIDE
1808 [\u{2060}-\u{2064}]| # WORD JOINER..
1810 \u{2065}| # <reserved-2065>
1811 [\u{2066}-\u{206F}]| # LEFT-TO-RIGHT ISOLATE..
1812 # NOMINAL DIGIT SHAPES
1813 \u{3164}| # HANGUL FILLER
1814 [\u{FE00}-\u{FE0F}]| # VARIATION SELECTOR-1..
1815 # VARIATION SELECTOR-16
1816 \u{FEFF}| # ZERO WIDTH NO-BREAK SPACE
1817 \u{FFA0}| # HALFWIDTH HANGUL FILLER
1818 [\u{FFF0}-\u{FFF8}]| # <reserved-FFF0>..
1820 [\u{1BCA0}-\u{1BCA3}]| # SHORTHAND FORMAT LETTER OVERLAP..
1821 # SHORTHAND FORMAT UP STEP
1822 [\u{1D173}-\u{1D17A}]| # MUSICAL SYMBOL BEGIN BEAM..
1823 # MUSICAL SYMBOL END PHRASE
1824 \u{E0000}| # <reserved-E0000>
1825 \u{E0001}| # LANGUAGE TAG
1826 [\u{E0002}-\u{E001F}]| # <reserved-E0002>..
1828 [\u{E0020}-\u{E007F}]| # TAG SPACE..
1830 [\u{E0080}-\u{E00FF}]| # <reserved-E0080>..
1832 [\u{E0100}-\u{E01EF}]| # VARIATION SELECTOR-17..
1833 # VARIATION SELECTOR-256
1834 [\u{E01F0}-\u{E0FFF}]| # <reserved-E01F0>..
1838 $host = preg_replace( $strip,
'', $host );
1841 if ( str_starts_with( $host,
"//%5B" ) &&
1842 preg_match(
'!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host,
$matches )
1849 return $protocol . $host . $rest;
1886 $hookRunner =
new HookRunner( MediaWikiServices::getInstance()->getHookContainer() );
1887 if ( !$hookRunner->onIsValidEmailAddr( $addr, $result ) ) {
1894 $rfc5322_atext =
"a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
1895 $rfc1034_ldh_str =
"a-z0-9\\-";
1897 $html5_email_regexp =
"/
1899 [$rfc5322_atext\\.]+ # user part which is liberal :p
1901 [$rfc1034_ldh_str]+ # First domain part
1902 (\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
1906 return (
bool)preg_match( $html5_email_regexp, $addr );
1914class_alias( Sanitizer::class,
'Sanitizer' );
wfDeprecatedMsg( $msg, $version=false, $component=false, $callerOffset=2)
Log a deprecation warning with arbitrary message text.
wfUrlProtocols( $includeProtocolRelative=true)
Returns a regular expression of url protocols.
wfDeprecated( $function, $version=false, $component=false, $callerOffset=2)
Logs a warning that a deprecated feature was used.
A collection of static methods to play with strings.
$wgAllowImageTag
Config variable stub for the AllowImageTag setting, for use by phpdoc and IDEs.
$wgFragmentMode
Config variable stub for the FragmentMode setting, for use by phpdoc and IDEs.
$wgExternalInterwikiFragmentMode
Config variable stub for the ExternalInterwikiFragmentMode setting, for use by phpdoc and IDEs.