29use InvalidArgumentException;
35use UnexpectedValueException;
36use Wikimedia\RemexHtml\HTMLData;
37use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer;
38use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer;
39use Wikimedia\RemexHtml\TreeBuilder\Dispatcher as RemexDispatcher;
40use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder as RemexTreeBuilder;
53 private const CHAR_REFS_REGEX =
54 '/&([A-Za-z0-9\x80-\xff]+;)
56 |&\#[xX]([0-9A-Fa-f]+);
/**
 * Pattern applied to each chunk of text that followed a '<' in the input.
 * On a match the capture groups are, in order: an optional leading slash,
 * the tag name, the raw attribute text, the closing '>' or '/>', and the
 * remaining text of the chunk.
 */
63 private const ELEMENT_BITS_REGEX =
'!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
/**
 * Case-insensitive pattern that flags a value containing "javascript" or
 * "vbscript" at the start, after whitespace, or after a closing CSS-style
 * comment marker, when that word is followed by a non-word character or the
 * end of the value. Attribute values matching it are rejected by the
 * validation code in this class.
 */
74 private const EVIL_URI_PATTERN =
'!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
/**
 * Pattern matching attribute names of the form "xmlns:" followed by one or
 * more of: colon, ASCII letter, underscore, hyphen, dot, or digit. Used to
 * recognise XML namespace declarations when validating attributes.
 */
75 private const XMLNS_ATTRIBUTE_PATTERN =
"/^xmlns:[:A-Z_a-z-.0-9]+$/";
96 private const MW_ENTITY_ALIASES = [
104 private static $attribsRegex;
112 private static function getAttribsRegex() {
113 if ( self::$attribsRegex ===
null ) {
114 $spaceChars =
'\x09\x0a\x0c\x0d\x20';
115 $space =
"[{$spaceChars}]";
116 $attrib =
"[^{$spaceChars}\/>=]";
117 $attribFirst =
"(?:{$attrib}|=)";
118 self::$attribsRegex =
119 "/({$attribFirst}{$attrib}*)
122 # The attribute value: quoted or alone
129 return self::$attribsRegex;
135 private static $attribNameRegex;
/**
 * Return the regular expression used to validate a whole attribute name.
 *
 * The pattern is assembled on the first call and stored in
 * self::$attribNameRegex; later calls hand back the stored value.
 *
 * @return string PCRE pattern including delimiters and the "sxu" modifiers
 */
private static function getAttribNameRegex() {
	if ( self::$attribNameRegex !== null ) {
		// Already built by an earlier call.
		return self::$attribNameRegex;
	}

	// First character: colon, underscore, or any Unicode letter or number.
	$leadingChar = "[:_\p{L}\p{N}]";
	// Following characters may additionally be a dot or a hyphen.
	$followingChar = "[:_\.\-\p{L}\p{N}]";

	self::$attribNameRegex = "/^({$leadingChar}{$followingChar}*)$/sxu";

	return self::$attribNameRegex;
}
159 static $commonCase, $staticInitialised;
160 $isCommonCase = ( $extratags === [] && $removetags === [] );
161 if ( $staticInitialised ===
$wgAllowImageTag && $isCommonCase && $commonCase ) {
165 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
166 $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic;
171 if ( !$staticInitialised || $staticInitialised !== $globalContext ) {
172 $htmlpairsStatic = [ # Tags that must be closed
173 'b',
'bdi',
'del',
'i',
'ins',
'u',
'font',
'big',
'small',
'sub',
'sup',
'h1',
174 'h2',
'h3',
'h4',
'h5',
'h6',
'cite',
'code',
'em',
's',
175 'strike',
'strong',
'tt',
'var',
'div',
'center',
176 'blockquote',
'ol',
'ul',
'dl',
'table',
'caption',
'pre',
177 'ruby',
'rb',
'rp',
'rt',
'rtc',
'p',
'span',
'abbr',
'dfn',
178 'kbd',
'samp',
'data',
'time',
'mark'
180 # These tags can be self-closed. For tags not also on
181 # $htmlsingleonly, a self-closed tag will be emitted as
182 # an empty element (open-tag/close-tag pair).
184 'br',
'wbr',
'hr',
'li',
'dt',
'dd',
'meta',
'link'
187 # Elements that cannot have close tags. This is (not coincidentally)
188 # also the list of tags for which the HTML 5 parsing algorithm
189 # requires you to "acknowledge the token's self-closing flag", i.e.
190 # a self-closing tag like <br/> is not an HTML 5 parse error only
193 'br',
'wbr',
'hr',
'meta',
'link'
196 $htmlnest = [ # Tags that can be nested--??
197 'table',
'tr',
'td',
'th',
'div',
'blockquote',
'ol',
'ul',
198 'li',
'dl',
'dt',
'dd',
'font',
'big',
'small',
'sub',
'sup',
'span',
199 'var',
'kbd',
'samp',
'em',
'strong',
'q',
'ruby',
'bdo'
201 $tabletags = [ # Can only appear inside table, we will close them
204 $htmllist = [ # Tags used by list
207 $listtags = [ # Tags that can appear in a list
213 'is deprecated since MediaWiki 1.35',
'1.35',
false,
false );
214 $htmlsingle[] =
'img';
215 $htmlsingleonly[] =
'img';
218 $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
219 $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
221 # Convert them all to hashtables for faster lookup
222 $vars = [
'htmlpairsStatic',
'htmlsingle',
'htmlsingleonly',
'htmlnest',
'tabletags',
223 'htmllist',
'listtags',
'htmlsingleallowed',
'htmlelementsStatic' ];
224 foreach ( $vars as $var ) {
225 $$var = array_fill_keys( $$var,
true );
227 $staticInitialised = $globalContext;
230 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
231 $extratags = array_fill_keys( $extratags,
true );
232 $removetags = array_fill_keys( $removetags,
true );
234 $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
236 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
239 'htmlpairs' => $htmlpairs,
240 'htmlsingle' => $htmlsingle,
241 'htmlsingleonly' => $htmlsingleonly,
242 'htmlnest' => $htmlnest,
243 'tabletags' => $tabletags,
244 'htmllist' => $htmllist,
245 'listtags' => $listtags,
246 'htmlsingleallowed' => $htmlsingleallowed,
247 'htmlelements' => $htmlelements,
249 if ( $isCommonCase ) {
250 $commonCase = $result;
286 $args = [], $extratags = [], $removetags = []
290 $text, $processCallback, $args, $extratags, $removetags
323 $args = [], $extratags = [], $removetags = []
326 $htmlsingle = $tagData[
'htmlsingle'];
327 $htmlsingleonly = $tagData[
'htmlsingleonly'];
328 $htmlelements = $tagData[
'htmlelements'];
330 # Remove HTML comments
332 $bits = explode(
'<', $text );
333 $text = str_replace(
'>',
'>', array_shift( $bits ) );
335 # this might be possible using remex tidy itself
336 foreach ( $bits as $x ) {
337 if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
338 [ , $slash, $t, $params, $brace, $rest ] = $regs;
341 $t = strtolower( $t );
342 if ( isset( $htmlelements[$t] ) ) {
343 if ( is_callable( $processCallback ) ) {
344 call_user_func_array( $processCallback, [ &$params, $args ] );
347 if ( $brace ==
'/>' && !( isset( $htmlsingle[$t] ) || isset( $htmlsingleonly[$t] ) ) ) {
352 if ( !self::validateTag( $params, $t ) ) {
358 if ( $brace ===
'/>' && !isset( $htmlsingleonly[$t] ) ) {
359 # Interpret self-closing tags as empty tags even when
360 # HTML 5 would interpret them as start tags. Such input
361 # is commonly seen on Wikimedia wikis with this intention.
365 $rest = str_replace(
'>',
'>', $rest );
366 $text .=
"<$slash$t$newparams$brace$rest";
371 $text .=
'<' . str_replace(
'>',
'>', $x );
398 string $text, array $options = []
400 $extraTags = $options[
'extraTags'] ?? [];
401 $removeTags = $options[
'removeTags'] ?? [];
403 $attrCallback = $options[
'attrCallback'] ??
null;
404 $attrCallbackArgs = $options[
'attrCallbackArgs'] ?? [];
413 $serializer =
new RemexSerializer( $formatter );
414 $treeBuilder =
new RemexTreeBuilder( $serializer, [
415 'ignoreErrors' =>
true,
416 'ignoreNulls' =>
true,
418 $dispatcher =
new RemexDispatcher( $treeBuilder );
419 $tokenHandler = $dispatcher;
421 $tokenHandler, $text, $tagData,
422 $attrCallback, $attrCallbackArgs
424 $tokenizer =
new RemexTokenizer( $remover, $text, [
425 'ignoreErrors' =>
true,
427 'ignoreNulls' =>
true,
428 'skipPreprocess' =>
true,
430 $tokenizer->execute( [
431 'fragmentNamespace' => HTMLData::NS_HTML,
432 'fragmentName' =>
'body',
434 return $serializer->getResult();
447 while ( ( $start = strpos( $text,
'<!--' ) ) !==
false ) {
448 $end = strpos( $text,
'-->', $start + 4 );
449 if ( $end ===
false ) {
450 # Unterminated comment; bail out
456 # Trim space and newline if the comment is both
457 # preceded and followed by a newline
458 $spaceStart = max( $start - 1, 0 );
459 $spaceLen = $end - $spaceStart;
460 while ( substr( $text, $spaceStart, 1 ) ===
' ' && $spaceStart > 0 ) {
464 while ( substr( $text, $spaceStart + $spaceLen, 1 ) ===
' ' ) {
467 if ( substr( $text, $spaceStart, 1 ) ===
"\n"
468 && substr( $text, $spaceStart + $spaceLen, 1 ) ===
"\n" ) {
469 # Remove the comment, leading and trailing
470 # spaces, and leave only one newline.
471 $text = substr_replace( $text,
"\n", $spaceStart, $spaceLen + 1 );
473 # Remove just the comment.
474 $text = substr_replace( $text,
'', $start, $end - $start );
494 private static function validateTag( $params, $element ) {
495 $params = self::decodeTagAttributes( $params );
497 if ( $element ==
'meta' || $element ==
'link' ) {
498 if ( !isset( $params[
'itemprop'] ) ) {
502 if ( $element ==
'meta' && !isset( $params[
'content'] ) ) {
506 if ( $element ==
'link' && !isset( $params[
'href'] ) ) {
531 return self::validateAttributes( $attribs,
532 self::attributesAllowedInternal( $element ) );
554 if ( isset( $allowed[0] ) ) {
557 wfDeprecated( __METHOD__ .
' with sequential array',
'1.35' );
558 $allowed = array_fill_keys( $allowed,
true );
563 foreach ( $attribs as $attribute => $value ) {
564 # Allow XML namespace declaration to allow RDFa
565 if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
566 if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
567 $out[$attribute] = $value;
573 # Allow any attribute beginning with "data-"
575 # * Disallow data attributes used by MediaWiki code
576 # * Ensure that the attribute is not namespaced by banning
579 !preg_match(
'/^data-[^:]*$/i', $attribute ) &&
580 !array_key_exists( $attribute, $allowed )
581 ) || self::isReservedDataAttribute( $attribute ) ) {
585 # Strip javascript "expression" from stylesheets.
587 if ( $attribute ==
'style' ) {
588 $value = self::checkCss( $value );
591 # Escape HTML id attributes
592 if ( $attribute ===
'id' ) {
593 $value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
596 # Escape HTML id reference lists
597 if ( $attribute ===
'aria-describedby'
598 || $attribute ===
'aria-flowto'
599 || $attribute ===
'aria-labelledby'
600 || $attribute ===
'aria-owns'
602 $value = self::escapeIdReferenceListInternal( $value );
606 if ( $attribute ===
'rel' || $attribute ===
'rev'
608 || $attribute ===
'about' || $attribute ===
'property'
609 || $attribute ===
'resource' || $attribute ===
'datatype'
610 || $attribute ===
'typeof'
612 || $attribute ===
'itemid' || $attribute ===
'itemprop'
613 || $attribute ===
'itemref' || $attribute ===
'itemscope'
614 || $attribute ===
'itemtype'
617 if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
622 # NOTE: even though elements using href/src are not allowed directly, supply
623 # validation code that can be used by tag hook handlers, etc
624 if ( $attribute ===
'href' || $attribute ===
'src' || $attribute ===
'poster' ) {
625 if ( !preg_match( $hrefExp, $value ) ) {
631 if ( $attribute ===
'tabindex' && $value !==
'0' ) {
638 $out[$attribute] = $value;
641 # itemtype, itemid, itemref don't make sense without itemscope
642 if ( !array_key_exists(
'itemscope', $out ) ) {
643 unset( $out[
'itemtype'] );
644 unset( $out[
'itemid'] );
645 unset( $out[
'itemref'] );
647 # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
667 return (
bool)preg_match(
'/^data-(ooui|mw|parsoid)/i', $attr );
681 $out = array_merge( $a, $b );
682 if ( isset( $a[
'class'] ) && isset( $b[
'class'] )
683 && is_string( $a[
'class'] ) && is_string( $b[
'class'] )
684 && $a[
'class'] !== $b[
'class']
686 $classes = preg_split(
'/\s+/',
"{$a['class']} {$b['class']}",
687 -1, PREG_SPLIT_NO_EMPTY );
688 $out[
'class'] = implode(
' ', array_unique( $classes ) );
703 $value = self::decodeCharReferences( $value );
715 if ( !$decodeRegex ) {
716 $space =
'[\\x20\\t\\r\\n\\f]';
717 $nl =
'(?:\\n|\\r\\n|\\r|\\f)';
719 $decodeRegex =
"/ $backslash
721 ($nl) | # 1. Line continuation
722 ([0-9A-Fa-f]{1,6})$space? | # 2. character number
723 (.) | # 3. backslash cancelling special meaning
724 () | # 4. backslash at end of string
727 $value = preg_replace_callback( $decodeRegex,
728 [ __CLASS__,
'cssDecodeCallback' ], $value );
733 if ( !preg_match(
'! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
740 $value = StringUtils::delimiterReplace(
'/*',
'*/',
' ', $value );
744 $commentPos = strpos( $value,
'/*' );
745 if ( $commentPos !==
false ) {
746 $value = substr( $value, 0, $commentPos );
772 $value = self::normalizeCss( $value );
775 if ( preg_match(
'/[\000-\010\013\016-\037\177]/', $value ) ||
776 strpos( $value, \UtfNormal\Constants::UTF8_REPLACEMENT ) !==
false ) {
777 return '/* invalid control char */';
778 } elseif ( preg_match(
788 | attr\s*\([^)]+[\s,]+url
790 return '/* insecure input */';
799 private static function cssDecodeCallback(
$matches ) {
804 # hexdec could return a float if the match is too long, but the
805 # regexp in question limits the string length to 6.
806 $char = \UtfNormal\Utils::codepointToUtf8( hexdec(
$matches[2] ) );
812 if ( $char ==
"\n" || $char ==
'"' || $char ==
"'" || $char ==
'\\' ) {
815 return '\\' . dechex( ord( $char ) ) .
' ';
844 if ( trim( $text ) ==
'' ) {
848 $decoded = self::decodeTagAttributes( $text );
849 $stripped = self::validateTagAttributes( $decoded, $element );
855 return self::safeEncodeTagAttributes( $stripped );
866 $encValue = htmlspecialchars( $text, ENT_QUOTES );
871 $encValue = strtr( $encValue, [
892 # French spaces, last one Guillemet-left
893 # only if it isn't followed by a word character.
894 '/ (?=[?:;!%»›](?!\w))/u' =>
"$space",
895 # French spaces, Guillemet-right
896 '/([«‹]) /u' =>
"\\1$space",
898 return preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
910 $encValue = self::encodeAttribute( $text );
912 # Templates and links may be expanded in later parsing,
913 # creating invalid or dangerous output. Suppress this.
914 $encValue = strtr( $encValue, [
922 "''" =>
'''',
923 'ISBN' =>
'ISBN',
925 'PMID' =>
'PMID',
931 $encValue = preg_replace_callback(
934 return str_replace(
':',
':',
$matches[1] );
959 if ( $mode === self::ID_PRIMARY ) {
960 throw new UnexpectedValueException(
'$wgFragmentMode is configured with no primary mode' );
967 return self::escapeIdInternal( $id, $internalMode );
986 throw new UnexpectedValueException(
'$wgFragmentMode is configured with no primary mode' );
991 $id = self::escapeIdInternalUrl( $id, $mode );
1022 private static function escapeIdInternalUrl( $id, $mode ) {
1023 $id = self::escapeIdInternal( $id, $mode );
1024 if ( $mode ===
'html5' ) {
1025 $id = preg_replace(
'/%([a-fA-F0-9]{2})/',
'%25$1', $id );
1037 private static function escapeIdInternal( $id, $mode ) {
1040 $id = mb_substr( $id, 0, 1024 );
1048 $id = str_replace( [
"\t",
"\n",
"\f",
"\r",
" " ],
'_', $id );
1057 $id = urlencode( str_replace(
' ',
'_', $id ) );
1058 $id = strtr( $id, $replace );
1061 throw new InvalidArgumentException(
"Invalid mode '$mode' passed to '" . __METHOD__ );
/**
 * Escape every id in a whitespace-separated list of id references.
 *
 * @param string $referenceString Whitespace-delimited list of ids
 * @return string The escaped ids joined by single spaces; an empty string
 *   ('') when the input contains no tokens
 */
private static function escapeIdReferenceListInternal( $referenceString ) {
	# Break the whitespace-delimited list into its non-empty tokens
	$tokens = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );

	# Escape each token as an id, collecting the results in their original order
	$escaped = [];
	foreach ( $tokens as $token ) {
		$escaped[] = self::escapeIdForAttribute( $token );
	}

	# Rejoin into a space delimited list string
	return implode( ' ', $escaped );
}
1103 return rtrim( preg_replace(
1104 [
'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
'/_+/' ],
1119 $html = self::decodeCharReferences( $html );
1120 # It seems wise to escape ' as well as ", as a matter of course. Can't
1121 # hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
1122 # don't cause the entire string to disappear.
1123 $html = htmlspecialchars( $html, ENT_QUOTES | ENT_SUBSTITUTE );
1135 public static function decodeTagAttributes( $text ) {
1136 if ( trim( $text ) == '' ) {
1141 if ( !preg_match_all(
1142 self::getAttribsRegex(),
1145 PREG_SET_ORDER ) ) {
1150 foreach ( $pairs as $set ) {
1151 $attribute = strtolower( $set[1] );
1153 // Filter attribute names with unacceptable characters
1154 if ( !preg_match( self::getAttribNameRegex(), $attribute ) ) {
1158 $value = self::getTagAttributeCallback( $set );
1160 // Normalize whitespace
1161 $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
1162 $value = trim( $value );
1164 // Decode character references
1165 $attribs[$attribute] = self::decodeCharReferences( $value );
1177 public static function safeEncodeTagAttributes( $assoc_array ) {
1179 foreach ( $assoc_array as $attribute => $value ) {
1180 $encAttribute = htmlspecialchars( $attribute, ENT_COMPAT );
1181 $encValue = self::safeEncodeAttribute( $value );
1183 $attribs[] = "$encAttribute=\"$encValue\"";
1185 return count( $attribs ) ?
' ' . implode(
' ', $attribs ) :
'';
1195 private static function getTagAttributeCallback( $set ) {
1196 if ( isset( $set[5] ) ) {
1199 } elseif ( isset( $set[4] ) ) {
1202 } elseif ( isset( $set[3] ) ) {
1205 } elseif ( !isset( $set[2] ) ) {
1206 # In XHTML, attributes must have a value so return an empty string.
1207 # See "Empty attribute syntax",
1211 throw new LogicException(
"Tag conditions not met. This should never happen and is a bug." );
1219 private static function normalizeWhitespace( $text ) {
1220 return trim( preg_replace(
1221 '/(?:\r\n|[\x20\x0d\x0a\x09])+/',
1235 return trim( preg_replace(
'/[ _]+/',
' ', $section ) );
1254 return preg_replace_callback(
1255 self::CHAR_REFS_REGEX,
1256 [ self::class,
'normalizeCharReferencesCallback' ],
1264 private static function normalizeCharReferencesCallback(
$matches ) {
1267 $ret = self::normalizeEntity(
$matches[1] );
1269 $ret = self::decCharReference(
$matches[2] );
1271 $ret = self::hexCharReference(
$matches[3] );
1273 if ( $ret ===
null ) {
1274 return htmlspecialchars(
$matches[0], ENT_COMPAT );
1290 private static function normalizeEntity( $name ) {
1291 if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
1293 return '&' . self::MW_ENTITY_ALIASES[$name];
1294 } elseif ( in_array( $name, [
'lt;',
'gt;',
'amp;',
'quot;' ],
true ) ) {
1297 } elseif ( isset( HTMLData::$namedEntityTranslations[$name] ) ) {
1299 return preg_replace_callback(
'/./Ssu',
static function ( $m ) {
1300 return '&#' . \UtfNormal\Utils::utf8ToCodepoint( $m[0] ) .
';';
1301 }, HTMLData::$namedEntityTranslations[$name] );
1303 return "&$name";
1311 private static function decCharReference( $codepoint ) {
1312 # intval() will (safely) saturate at the maximum signed integer
1313 # value if $codepoint is too many digits
1314 $point = intval( $codepoint );
1315 if ( self::validateCodepoint( $point ) ) {
1316 return sprintf(
'&#%d;', $point );
1326 private static function hexCharReference( $codepoint ) {
1327 # hexdec() will return a float (not an int) if $codepoint is too
1328 # long, so protect against that. The largest valid codepoint is
1330 if ( strlen( ltrim( $codepoint,
'0' ) ) > 6 ) {
1333 $point = hexdec( $codepoint );
1334 if ( self::validateCodepoint( $point ) ) {
1335 return sprintf(
'&#x%x;', $point );
/**
 * Report whether a numeric code point is one this class accepts.
 *
 * Accepted values are U+0009, U+000A, U+0020..U+007E, U+00A0..U+D7FF,
 * U+E000..U+FFFD and U+10000..U+10FFFF; everything else is rejected.
 *
 * @param int|float $codepoint Numeric code point; callers in this file pass
 *   the result of intval() or hexdec()
 * @return bool True when the code point falls in one of the accepted ranges
 */
private static function validateCodepoint( $codepoint ) {
	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	# U+007F - U+009F are disallowed in HTML5 (control characters).

	// Tab and line feed are the only accepted values below U+0020.
	if ( $codepoint == 0x09 || $codepoint == 0x0a ) {
		return true;
	}

	// Printable ASCII.
	if ( $codepoint >= 0x20 && $codepoint <= 0x7e ) {
		return true;
	}

	// From U+00A0 up to, but not including, the surrogate range.
	if ( $codepoint >= 0xa0 && $codepoint <= 0xd7ff ) {
		return true;
	}

	// After the surrogates, stopping before U+FFFE and U+FFFF.
	if ( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
		return true;
	}

	// Everything from U+10000 to the top of the Unicode range.
	return $codepoint >= 0x10000 && $codepoint <= 0x10ffff;
}
1367 return preg_replace_callback(
1368 self::CHAR_REFS_REGEX,
1369 [ self::class,
'decodeCharReferencesCallback' ],
1384 $text = preg_replace_callback(
1385 self::CHAR_REFS_REGEX,
1386 [ self::class,
'decodeCharReferencesCallback' ],
1393 return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $text );
1403 private static function decodeCharReferencesCallback(
$matches ) {
1405 return self::decodeEntity(
$matches[1] );
1407 return self::decodeChar( intval(
$matches[2] ) );
1409 # hexdec will return a float if the string is too long (!) so
1410 # check the length of the string first.
1411 if ( strlen( ltrim(
$matches[3],
'0' ) ) > 6 ) {
1413 return \UtfNormal\Constants::UTF8_REPLACEMENT;
1415 return self::decodeChar( hexdec(
$matches[3] ) );
1417 # Last case should be an ampersand by itself
/**
 * Turn a numeric code point into the output of
 * \UtfNormal\Utils::codepointToUtf8(), or into
 * \UtfNormal\Constants::UTF8_REPLACEMENT when the code point does not pass
 * self::validateCodepoint().
 *
 * @param int|float $codepoint Numeric code point; callers in this file pass
 *   the result of intval() or hexdec()
 * @return string Presumably a UTF-8 encoded character (the UtfNormal
 *   helpers are not visible from this file)
 */
private static function decodeChar( $codepoint ) {
	return self::validateCodepoint( $codepoint )
		? \UtfNormal\Utils::codepointToUtf8( $codepoint )
		: \UtfNormal\Constants::UTF8_REPLACEMENT;
}
/**
 * Look up the replacement text for a named entity.
 *
 * The name is first mapped through self::MW_ENTITY_ALIASES, then looked up
 * in HTMLData::$namedEntityTranslations. When no translation exists the
 * (possibly aliased) name is returned with its leading ampersand restored.
 *
 * @param string $name Entity name without the leading '&'; the character
 *   reference pattern in this class captures it with its trailing ';'
 * @return string The translation, or "&" followed by the name
 */
private static function decodeEntity( $name ) {
	// Swap a MediaWiki alias for the name it stands for, when one is defined.
	$canonical = self::MW_ENTITY_ALIASES[$name] ?? $name;

	// Unknown names are handed back in "&name" form.
	return HTMLData::$namedEntityTranslations[$canonical] ?? "&$canonical";
}
/**
 * Fetch the allowed-attribute map for one element.
 *
 * @param string $element Element name used as the lookup key
 * @return array The entry that self::setupAttributesAllowedInternal() holds
 *   for $element, or an empty array when the element has no entry
 */
private static function attributesAllowedInternal( $element ) {
	return self::setupAttributesAllowedInternal()[$element] ?? [];
}
1472 private static function setupAttributesAllowedInternal() {
1475 if ( $allowed !==
null ) {
1481 $merge =
static function ( $a, $b, $c = [] ) {
1484 array_fill_keys( $b,
true ),
1485 array_fill_keys( $c,
true ) );
1487 $common = $merge( [], [
1508 # These attributes are specified in section 9 of
1516 # Microdata. These are specified by
1525 $block = $merge( $common, [
'align' ] );
1527 $tablealign = [
'align',
'valign' ];
1535 'nowrap', # deprecated
1536 'width', # deprecated
1537 'height', # deprecated
1538 'bgcolor', # deprecated
1541 # Numbers refer to sections in HTML 4.01 standard describing the element.
1546 'center' => $common, # deprecated
1565 'strong' => $common,
1576 'blockquote' => $merge( $common, [
'cite' ] ),
1577 'q' => $merge( $common, [
'cite' ] ),
1587 'br' => $merge( $common, [
'clear' ] ),
1593 'pre' => $merge( $common, [
'width' ] ),
1596 'ins' => $merge( $common, [
'cite',
'datetime' ] ),
1597 'del' => $merge( $common, [
'cite',
'datetime' ] ),
1600 'ul' => $merge( $common, [
'type' ] ),
1601 'ol' => $merge( $common, [
'type',
'start',
'reversed' ] ),
1602 'li' => $merge( $common, [
'type',
'value' ] ),
1610 'table' => $merge( $common,
1611 [
'summary',
'width',
'border',
'frame',
1612 'rules',
'cellspacing',
'cellpadding',
1617 'caption' => $block,
1625 'colgroup' => $merge( $common, [
'span' ] ),
1626 'col' => $merge( $common, [
'span' ] ),
1629 'tr' => $merge( $common, [
'bgcolor' ], $tablealign ),
1632 'td' => $merge( $common, $tablecell, $tablealign ),
1633 'th' => $merge( $common, $tablecell, $tablealign ),
1636 # NOTE: <a> is not allowed directly, but this list of allowed
1637 # attributes is used from the Parser object
1638 'a' => $merge( $common, [
'href',
'rel',
'rev' ] ), # rel/rev esp.
for RDFa
1641 # Not usually allowed, but may be used for extension-style hooks
1642 # such as <math> when it is rasterized, or if $wgAllowImageTag is
1644 'img' => $merge( $common, [
'alt',
'src',
'width',
'height',
'srcset' ] ),
1645 # Attributes for A/V tags added in T163583 / T133673
1646 'audio' => $merge( $common, [
'controls',
'preload',
'width',
'height' ] ),
1647 'video' => $merge( $common, [
'poster',
'controls',
'preload',
'width',
'height' ] ),
1648 'source' => $merge( $common, [
'type',
'src' ] ),
1649 'track' => $merge( $common, [
'type',
'src',
'srclang',
'kind',
'label' ] ),
1657 'strike' => $common,
1662 'font' => $merge( $common, [
'size',
'color',
'face' ] ),
1666 'hr' => $merge( $common, [
'width' ] ),
1668 # HTML Ruby annotation text module, simple ruby only.
1674 'rt' => $common, # $merge( $common, [
'rbspan' ] ),
1677 # MathML root element, where used for extensions
1678 # 'title' may not be 100% valid here; it's XHTML
1680 'math' => $merge( [], [
'class',
'style',
'id',
'title' ] ),
1683 'figure' => $common,
1684 'figcaption' => $common,
1686 # HTML 5 section 4.6
1689 # HTML5 elements, defined by:
1691 'data' => $merge( $common, [
'value' ] ),
1692 'time' => $merge( $common, [
'datetime' ] ),
1700 'meta' => $merge( [], [
'itemprop',
'content' ] ),
1701 'link' => $merge( [], [
'itemprop',
'href',
'title' ] ),
1703 # HTML 5 section 4.3.5
1724 $tokenizer =
new RemexTokenizer( $handler, $html, [
1725 'ignoreErrors' =>
true,
1727 'ignoreNulls' =>
true,
1728 'skipPreprocess' =>
true,
1730 $tokenizer->execute();
1731 $text = $handler->getResult();
1733 $text = self::normalizeWhitespace( $text );
1749 $out =
"<!DOCTYPE html [\n";
1750 foreach ( HTMLData::$namedEntityTranslations as $entity => $translation ) {
1751 if ( substr( $entity, -1 ) !==
';' ) {
1756 $name = substr( $entity, 0, -1 );
1757 $expansion = self::normalizeEntity( $entity );
1758 if ( $entity === $expansion ) {
1762 $out .=
"<!ENTITY $name \"$expansion\">";
1773 # Normalize any HTML entities in input. They will be
1774 # re-escaped by makeExternalLink().
1775 $url = self::decodeCharReferences( $url );
1777 # Escape any control characters introduced by the above step
1778 $url = preg_replace_callback(
'/[\][<>"\\x00-\\x20\\x7F\|]/',
1779 [ __CLASS__,
'cleanUrlCallback' ], $url );
1781 # Validate hostname portion
1783 if ( preg_match(
'!^([^:]+:)(//[^/]+)?(.*)$!iD', $url,
$matches ) ) {
1784 [ , $protocol, $host, $rest ] =
$matches;
1791 \\s| # general whitespace
1792 \u{00AD}| # SOFT HYPHEN
1793 \u{034F}| # COMBINING GRAPHEME JOINER
1794 \u{061C}| # ARABIC LETTER MARK
1795 [\u{115F}-\u{1160}]| # HANGUL CHOSEONG FILLER..
1796 # HANGUL JUNGSEONG FILLER
1797 [\u{17B4}-\u{17B5}]| # KHMER VOWEL INHERENT AQ..
1798 # KHMER VOWEL INHERENT AA
1799 [\u{180B}-\u{180D}]| # MONGOLIAN FREE VARIATION SELECTOR ONE..
1800 # MONGOLIAN FREE VARIATION SELECTOR THREE
1801 \u{180E}| # MONGOLIAN VOWEL SEPARATOR
1802 [\u{200B}-\u{200F}]| # ZERO WIDTH SPACE..
1803 # RIGHT-TO-LEFT MARK
1804 [\u{202A}-\u{202E}]| # LEFT-TO-RIGHT EMBEDDING..
1805 # RIGHT-TO-LEFT OVERRIDE
1806 [\u{2060}-\u{2064}]| # WORD JOINER..
1808 \u{2065}| # <reserved-2065>
1809 [\u{2066}-\u{206F}]| # LEFT-TO-RIGHT ISOLATE..
1810 # NOMINAL DIGIT SHAPES
1811 \u{3164}| # HANGUL FILLER
1812 [\u{FE00}-\u{FE0F}]| # VARIATION SELECTOR-1..
1813 # VARIATION SELECTOR-16
1814 \u{FEFF}| # ZERO WIDTH NO-BREAK SPACE
1815 \u{FFA0}| # HALFWIDTH HANGUL FILLER
1816 [\u{FFF0}-\u{FFF8}]| # <reserved-FFF0>..
1818 [\u{1BCA0}-\u{1BCA3}]| # SHORTHAND FORMAT LETTER OVERLAP..
1819 # SHORTHAND FORMAT UP STEP
1820 [\u{1D173}-\u{1D17A}]| # MUSICAL SYMBOL BEGIN BEAM..
1821 # MUSICAL SYMBOL END PHRASE
1822 \u{E0000}| # <reserved-E0000>
1823 \u{E0001}| # LANGUAGE TAG
1824 [\u{E0002}-\u{E001F}]| # <reserved-E0002>..
1826 [\u{E0020}-\u{E007F}]| # TAG SPACE..
1828 [\u{E0080}-\u{E00FF}]| # <reserved-E0080>..
1830 [\u{E0100}-\u{E01EF}]| # VARIATION SELECTOR-17..
1831 # VARIATION SELECTOR-256
1832 [\u{E01F0}-\u{E0FFF}]| # <reserved-E01F0>..
1836 $host = preg_replace( $strip,
'', $host );
1839 if ( str_starts_with( $host,
"//%5B" ) &&
1840 preg_match(
'!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host,
$matches )
1847 return $protocol . $host . $rest;
1857 private static function cleanUrlCallback(
$matches ) {
1892 $hookRunner =
new HookRunner( MediaWikiServices::getInstance()->getHookContainer() );
1893 if ( !$hookRunner->onIsValidEmailAddr( $addr, $result ) ) {
1900 $rfc5322_atext =
"a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
1901 $rfc1034_ldh_str =
"a-z0-9\\-";
1903 $html5_email_regexp =
"/
1905 [$rfc5322_atext\\.]+ # user part which is liberal :p
1907 [$rfc1034_ldh_str]+ # First domain part
1908 (\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
1912 return (
bool)preg_match( $html5_email_regexp, $addr );
1920class_alias( Sanitizer::class,
'Sanitizer' );
wfDeprecatedMsg( $msg, $version=false, $component=false, $callerOffset=2)
Log a deprecation warning with arbitrary message text.
wfUrlProtocols( $includeProtocolRelative=true)
Returns a regular expression of url protocols.
wfDeprecated( $function, $version=false, $component=false, $callerOffset=2)
Logs a warning that a deprecated feature was used.
A collection of static methods to play with strings.
$wgAllowImageTag
Config variable stub for the AllowImageTag setting, for use by phpdoc and IDEs.
$wgFragmentMode
Config variable stub for the FragmentMode setting, for use by phpdoc and IDEs.
$wgExternalInterwikiFragmentMode
Config variable stub for the ExternalInterwikiFragmentMode setting, for use by phpdoc and IDEs.