31 use Wikimedia\RemexHtml\HTMLData;
32 use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer;
33 use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer;
34 use Wikimedia\RemexHtml\TreeBuilder\Dispatcher as RemexDispatcher;
35 use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder as RemexTreeBuilder;
48 private const CHAR_REFS_REGEX =
49 '/&([A-Za-z0-9\x80-\xff]+;)
51 |&\#[xX]([0-9A-Fa-f]+);
58 private const ELEMENT_BITS_REGEX =
'!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
69 private const EVIL_URI_PATTERN =
'!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
70 private const XMLNS_ATTRIBUTE_PATTERN =
"/^xmlns:[:A-Z_a-z-.0-9]+$/";
91 private const MW_ENTITY_ALIASES = [
99 private static $attribsRegex;
107 private static function getAttribsRegex() {
108 if ( self::$attribsRegex ===
null ) {
109 $spaceChars =
'\x09\x0a\x0c\x0d\x20';
110 $space =
"[{$spaceChars}]";
111 $attrib =
"[^{$spaceChars}\/>=]";
112 $attribFirst =
"(?:{$attrib}|=)";
113 self::$attribsRegex =
114 "/({$attribFirst}{$attrib}*)
117 # The attribute value: quoted or alone
124 return self::$attribsRegex;
130 private static $attribNameRegex;
136 private static function getAttribNameRegex() {
137 if ( self::$attribNameRegex ===
null ) {
138 $attribFirst =
"[:_\p{L}\p{N}]";
139 $attrib =
"[:_\.\-\p{L}\p{N}]";
140 self::$attribNameRegex =
"/^({$attribFirst}{$attrib}*)$/sxu";
142 return self::$attribNameRegex;
154 static $commonCase, $staticInitialised;
155 $isCommonCase = ( $extratags === [] && $removetags === [] );
156 if ( $staticInitialised ===
$wgAllowImageTag && $isCommonCase && $commonCase ) {
160 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
161 $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic;
166 if ( !$staticInitialised || $staticInitialised !== $globalContext ) {
167 $htmlpairsStatic = [ # Tags that must be closed
168 'b',
'bdi',
'del',
'i',
'ins',
'u',
'font',
'big',
'small',
'sub',
'sup',
'h1',
169 'h2',
'h3',
'h4',
'h5',
'h6',
'cite',
'code',
'em',
's',
170 'strike',
'strong',
'tt',
'var',
'div',
'center',
171 'blockquote',
'ol',
'ul',
'dl',
'table',
'caption',
'pre',
172 'ruby',
'rb',
'rp',
'rt',
'rtc',
'p',
'span',
'abbr',
'dfn',
173 'kbd',
'samp',
'data',
'time',
'mark'
175 # These tags can be self-closed. For tags not also on
176 # $htmlsingleonly, a self-closed tag will be emitted as
177 # an empty element (open-tag/close-tag pair).
179 'br',
'wbr',
'hr',
'li',
'dt',
'dd',
'meta',
'link'
182 # Elements that cannot have close tags. This is (not coincidentally)
183 # also the list of tags for which the HTML 5 parsing algorithm
184 # requires you to "acknowledge the token's self-closing flag", i.e.
185 # a self-closing tag like <br/> is not an HTML 5 parse error only
188 'br',
'wbr',
'hr',
'meta',
'link'
191 $htmlnest = [ # Tags that can be nested--??
192 'table',
'tr',
'td',
'th',
'div',
'blockquote',
'ol',
'ul',
193 'li',
'dl',
'dt',
'dd',
'font',
'big',
'small',
'sub',
'sup',
'span',
194 'var',
'kbd',
'samp',
'em',
'strong',
'q',
'ruby',
'bdo'
196 $tabletags = [ # Can only appear inside table, we will close them
199 $htmllist = [ # Tags used by list
202 $listtags = [ # Tags that can appear in a list
208 'is deprecated since MediaWiki 1.35',
'1.35',
false,
false );
209 $htmlsingle[] =
'img';
210 $htmlsingleonly[] =
'img';
213 $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
214 $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
216 # Convert them all to hashtables for faster lookup
217 $vars = [
'htmlpairsStatic',
'htmlsingle',
'htmlsingleonly',
'htmlnest',
'tabletags',
218 'htmllist',
'listtags',
'htmlsingleallowed',
'htmlelementsStatic' ];
219 foreach ( $vars as $var ) {
220 $$var = array_fill_keys( $$var,
true );
222 $staticInitialised = $globalContext;
225 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
226 $extratags = array_fill_keys( $extratags,
true );
227 $removetags = array_fill_keys( $removetags,
true );
229 $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
231 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
234 'htmlpairs' => $htmlpairs,
235 'htmlsingle' => $htmlsingle,
236 'htmlsingleonly' => $htmlsingleonly,
237 'htmlnest' => $htmlnest,
238 'tabletags' => $tabletags,
239 'htmllist' => $htmllist,
240 'listtags' => $listtags,
241 'htmlsingleallowed' => $htmlsingleallowed,
242 'htmlelements' => $htmlelements,
244 if ( $isCommonCase ) {
245 $commonCase = $result;
281 $args = [], $extratags = [], $removetags = []
285 $text, $processCallback, $args, $extratags, $removetags
318 $args = [], $extratags = [], $removetags = []
321 $htmlsingle = $tagData[
'htmlsingle'];
322 $htmlsingleonly = $tagData[
'htmlsingleonly'];
323 $htmlelements = $tagData[
'htmlelements'];
325 # Remove HTML comments
327 $bits = explode(
'<', $text );
328 $text = str_replace(
'>',
'>', array_shift( $bits ) );
330 # this might be possible using remex tidy itself
331 foreach ( $bits as $x ) {
332 if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
333 [ , $slash,
$t, $params, $brace, $rest ] = $regs;
336 $t = strtolower(
$t );
337 if ( isset( $htmlelements[
$t] ) ) {
338 if ( is_callable( $processCallback ) ) {
339 call_user_func_array( $processCallback, [ &$params, $args ] );
342 if ( $brace ==
'/>' && !( isset( $htmlsingle[
$t] ) || isset( $htmlsingleonly[
$t] ) ) ) {
347 if ( !self::validateTag( $params,
$t ) ) {
353 if ( $brace ===
'/>' && !isset( $htmlsingleonly[
$t] ) ) {
354 # Interpret self-closing tags as empty tags even when
355 # HTML 5 would interpret them as start tags. Such input
356 # is commonly seen on Wikimedia wikis with this intention.
360 $rest = str_replace(
'>',
'>', $rest );
361 $text .=
"<$slash$t$newparams$brace$rest";
366 $text .=
'<' . str_replace(
'>',
'>', $x );
393 string $text, array $options = []
395 $extraTags = $options[
'extraTags'] ?? [];
396 $removeTags = $options[
'removeTags'] ?? [];
398 $attrCallback = $options[
'attrCallback'] ??
null;
399 $attrCallbackArgs = $options[
'attrCallbackArgs'] ?? [];
408 $serializer =
new RemexSerializer( $formatter );
409 $treeBuilder =
new RemexTreeBuilder( $serializer, [
410 'ignoreErrors' =>
true,
411 'ignoreNulls' =>
true,
413 $dispatcher =
new RemexDispatcher( $treeBuilder );
414 $tokenHandler = $dispatcher;
416 $tokenHandler, $text, $tagData,
417 $attrCallback, $attrCallbackArgs
419 $tokenizer =
new RemexTokenizer( $remover, $text, [
420 'ignoreErrors' =>
true,
422 'ignoreNulls' =>
true,
423 'skipPreprocess' =>
true,
425 $tokenizer->execute( [
426 'fragmentNamespace' => HTMLData::NS_HTML,
427 'fragmentName' =>
'body',
429 return $serializer->getResult();
442 while ( ( $start = strpos( $text,
'<!--' ) ) !==
false ) {
443 $end = strpos( $text,
'-->', $start + 4 );
444 if ( $end ===
false ) {
445 # Unterminated comment; bail out
451 # Trim space and newline if the comment is both
452 # preceded and followed by a newline
453 $spaceStart = max( $start - 1, 0 );
454 $spaceLen = $end - $spaceStart;
455 while ( substr( $text, $spaceStart, 1 ) ===
' ' && $spaceStart > 0 ) {
459 while ( substr( $text, $spaceStart + $spaceLen, 1 ) ===
' ' ) {
462 if ( substr( $text, $spaceStart, 1 ) ===
"\n"
463 && substr( $text, $spaceStart + $spaceLen, 1 ) ===
"\n" ) {
464 # Remove the comment, leading and trailing
465 # spaces, and leave only one newline.
466 $text = substr_replace( $text,
"\n", $spaceStart, $spaceLen + 1 );
468 # Remove just the comment.
469 $text = substr_replace( $text,
'', $start, $end - $start );
489 private static function validateTag( $params, $element ) {
490 $params = self::decodeTagAttributes( $params );
492 if ( $element ==
'meta' || $element ==
'link' ) {
493 if ( !isset( $params[
'itemprop'] ) ) {
497 if ( $element ==
'meta' && !isset( $params[
'content'] ) ) {
501 if ( $element ==
'link' && !isset( $params[
'href'] ) ) {
526 return self::validateAttributes( $attribs,
527 self::attributesAllowedInternal( $element ) );
549 if ( isset( $allowed[0] ) ) {
552 wfDeprecated( __METHOD__ .
' with sequential array',
'1.35' );
553 $allowed = array_fill_keys( $allowed,
true );
558 foreach ( $attribs as $attribute => $value ) {
559 # Allow XML namespace declaration to allow RDFa
560 if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
561 if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
562 $out[$attribute] = $value;
568 # Allow any attribute beginning with "data-"
570 # * Disallow data attributes used by MediaWiki code
571 # * Ensure that the attribute is not namespaced by banning
574 !preg_match(
'/^data-[^:]*$/i', $attribute ) &&
575 !array_key_exists( $attribute, $allowed )
576 ) || self::isReservedDataAttribute( $attribute ) ) {
580 # Strip javascript "expression" from stylesheets.
582 if ( $attribute ==
'style' ) {
583 $value = self::checkCss( $value );
586 # Escape HTML id attributes
587 if ( $attribute ===
'id' ) {
588 $value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
591 # Escape HTML id reference lists
592 if ( $attribute ===
'aria-describedby'
593 || $attribute ===
'aria-flowto'
594 || $attribute ===
'aria-labelledby'
595 || $attribute ===
'aria-owns'
597 $value = self::escapeIdReferenceListInternal( $value );
601 if ( $attribute ===
'rel' || $attribute ===
'rev'
603 || $attribute ===
'about' || $attribute ===
'property'
604 || $attribute ===
'resource' || $attribute ===
'datatype'
605 || $attribute ===
'typeof'
607 || $attribute ===
'itemid' || $attribute ===
'itemprop'
608 || $attribute ===
'itemref' || $attribute ===
'itemscope'
609 || $attribute ===
'itemtype'
612 if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
617 # NOTE: even though elements using href/src are not allowed directly, supply
618 # validation code that can be used by tag hook handlers, etc
619 if ( $attribute ===
'href' || $attribute ===
'src' || $attribute ===
'poster' ) {
620 if ( !preg_match( $hrefExp, $value ) ) {
626 if ( $attribute ===
'tabindex' && $value !==
'0' ) {
633 $out[$attribute] = $value;
636 # itemtype, itemid, itemref don't make sense without itemscope
637 if ( !array_key_exists(
'itemscope', $out ) ) {
638 unset( $out[
'itemtype'] );
639 unset( $out[
'itemid'] );
640 unset( $out[
'itemref'] );
642 # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
662 return (
bool)preg_match(
'/^data-(ooui|mw|parsoid)/i', $attr );
676 $out = array_merge( $a, $b );
677 if ( isset( $a[
'class'] ) && isset( $b[
'class'] )
678 && is_string( $a[
'class'] ) && is_string( $b[
'class'] )
679 && $a[
'class'] !== $b[
'class']
681 $classes = preg_split(
'/\s+/',
"{$a['class']} {$b['class']}",
682 -1, PREG_SPLIT_NO_EMPTY );
683 $out[
'class'] = implode(
' ', array_unique( $classes ) );
698 $value = self::decodeCharReferences( $value );
710 if ( !$decodeRegex ) {
711 $space =
'[\\x20\\t\\r\\n\\f]';
712 $nl =
'(?:\\n|\\r\\n|\\r|\\f)';
714 $decodeRegex =
"/ $backslash
716 ($nl) | # 1. Line continuation
717 ([0-9A-Fa-f]{1,6})$space? | # 2. character number
718 (.) | # 3. backslash cancelling special meaning
719 () | # 4. backslash at end of string
722 $value = preg_replace_callback( $decodeRegex,
723 [ __CLASS__,
'cssDecodeCallback' ], $value );
728 if ( !preg_match(
'! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
739 $commentPos = strpos( $value,
'/*' );
740 if ( $commentPos !==
false ) {
741 $value = substr( $value, 0, $commentPos );
767 $value = self::normalizeCss( $value );
770 if ( preg_match(
'/[\000-\010\013\016-\037\177]/', $value ) ||
771 strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !==
false ) {
772 return '/* invalid control char */';
773 } elseif ( preg_match(
783 | attr\s*\([^)]+[\s,]+url
785 return '/* insecure input */';
794 private static function cssDecodeCallback(
$matches ) {
799 # hexdec could return a float if the match is too long, but the
800 # regexp in question limits the string length to 6.
801 $char = UtfNormal\Utils::codepointToUtf8( hexdec(
$matches[2] ) );
807 if ( $char ==
"\n" || $char ==
'"' || $char ==
"'" || $char ==
'\\' ) {
810 return '\\' . dechex( ord( $char ) ) .
' ';
839 if ( trim( $text ) ==
'' ) {
843 $decoded = self::decodeTagAttributes( $text );
844 $stripped = self::validateTagAttributes( $decoded, $element );
850 return self::safeEncodeTagAttributes( $stripped );
859 $encValue = htmlspecialchars( $text, ENT_QUOTES );
864 $encValue = strtr( $encValue, [
885 # French spaces, last one Guillemet-left
886 # only if it isn't followed by a word character.
887 '/ (?=[?:;!%»›](?!\w))/u' =>
"$space",
888 # French spaces, Guillemet-right
889 '/([«‹]) /u' =>
"\\1$space",
891 return preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
901 $encValue = self::encodeAttribute( $text );
903 # Templates and links may be expanded in later parsing,
904 # creating invalid or dangerous output. Suppress this.
905 $encValue = strtr( $encValue, [
913 "''" =>
'''',
914 'ISBN' =>
'ISBN',
916 'PMID' =>
'PMID',
922 $encValue = preg_replace_callback(
925 return str_replace(
':',
':',
$matches[1] );
950 if ( $mode === self::ID_PRIMARY ) {
951 throw new UnexpectedValueException(
'$wgFragmentMode is configured with no primary mode' );
958 return self::escapeIdInternal( $id, $internalMode );
977 throw new UnexpectedValueException(
'$wgFragmentMode is configured with no primary mode' );
982 $id = self::escapeIdInternalUrl( $id, $mode );
1013 private static function escapeIdInternalUrl( $id, $mode ) {
1014 $id = self::escapeIdInternal( $id, $mode );
1015 if ( $mode ===
'html5' ) {
1016 $id = preg_replace(
'/%([a-fA-F0-9]{2})/',
'%25$1', $id );
1028 private static function escapeIdInternal( $id, $mode ) {
1031 $id = mb_substr( $id, 0, 1024 );
1039 $id = str_replace( [
"\t",
"\n",
"\f",
"\r",
" " ],
'_', $id );
1048 $id = urlencode( str_replace(
' ',
'_', $id ) );
1049 $id = strtr( $id, $replace );
1052 throw new InvalidArgumentException(
"Invalid mode '$mode' passed to '" . __METHOD__ );
1070 return self::escapeIdReferenceListInternal( $referenceString );
1080 private static function escapeIdReferenceListInternal( $referenceString ) {
1081 # Explode the space delimited list string into an array of tokens
1082 $references = preg_split(
'/\s+/',
"{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );
1084 # Escape each token as an id
1085 foreach ( $references as &$ref ) {
1086 $ref = self::escapeIdForAttribute( $ref );
1089 # Merge the array back to a space delimited list string
1090 # If the array is empty, the result will be an empty string ('')
1091 $referenceString = implode(
' ', $references );
1093 return $referenceString;
1109 return rtrim( preg_replace(
1110 [
'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
'/_+/' ],
1123 $html = self::decodeCharReferences( $html );
1124 # It seems wise to escape ' as well as ", as a matter of course. Can't
1125 # hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
1126 # don't cause the entire string to disappear.
1127 $html = htmlspecialchars( $html, ENT_QUOTES | ENT_SUBSTITUTE );
1139 public static function decodeTagAttributes( $text ) {
1140 if ( trim( $text ) == '' ) {
1145 if ( !preg_match_all(
1146 self::getAttribsRegex(),
1149 PREG_SET_ORDER ) ) {
1154 foreach ( $pairs as $set ) {
1155 $attribute = strtolower( $set[1] );
1157 // Filter attribute names with unacceptable characters
1158 if ( !preg_match( self::getAttribNameRegex(), $attribute ) ) {
1162 $value = self::getTagAttributeCallback( $set );
1164 // Normalize whitespace
1165 $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
1166 $value = trim( $value );
1168 // Decode character references
1169 $attribs[$attribute] = self::decodeCharReferences( $value );
1181 public static function safeEncodeTagAttributes( $assoc_array ) {
1183 foreach ( $assoc_array as $attribute => $value ) {
1184 $encAttribute = htmlspecialchars( $attribute, ENT_COMPAT );
1185 $encValue = self::safeEncodeAttribute( $value );
1187 $attribs[] = "$encAttribute=\"$encValue\"";
1189 return count( $attribs ) ?
' ' . implode(
' ', $attribs ) :
'';
1200 private static function getTagAttributeCallback( $set ) {
1201 if ( isset( $set[5] ) ) {
1204 } elseif ( isset( $set[4] ) ) {
1207 } elseif ( isset( $set[3] ) ) {
1210 } elseif ( !isset( $set[2] ) ) {
1211 # In XHTML, attributes must have a value so return an empty string.
1212 # See "Empty attribute syntax",
1216 throw new MWException(
"Tag conditions not met. This should never happen and is a bug." );
1224 private static function normalizeWhitespace( $text ) {
1225 return trim( preg_replace(
1226 '/(?:\r\n|[\x20\x0d\x0a\x09])+/',
1240 return trim( preg_replace(
'/[ _]+/',
' ', $section ) );
1259 return preg_replace_callback(
1260 self::CHAR_REFS_REGEX,
1261 [ self::class,
'normalizeCharReferencesCallback' ],
1269 private static function normalizeCharReferencesCallback(
$matches ) {
1272 $ret = self::normalizeEntity(
$matches[1] );
1274 $ret = self::decCharReference(
$matches[2] );
1276 $ret = self::hexCharReference(
$matches[3] );
1278 if ( $ret ===
null ) {
1279 return htmlspecialchars(
$matches[0], ENT_COMPAT );
1295 private static function normalizeEntity( $name ) {
1296 if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
1298 return '&' . self::MW_ENTITY_ALIASES[$name];
1299 } elseif ( in_array( $name, [
'lt;',
'gt;',
'amp;',
'quot;' ],
true ) ) {
1302 } elseif ( isset( HTMLData::$namedEntityTranslations[$name] ) ) {
1304 return preg_replace_callback(
'/./Ssu',
static function ( $m ) {
1305 return '&#' . UtfNormal\Utils::utf8ToCodepoint( $m[0] ) .
';';
1306 }, HTMLData::$namedEntityTranslations[$name] );
1308 return "&$name";
1316 private static function decCharReference( $codepoint ) {
1317 # intval() will (safely) saturate at the maximum signed integer
1318 # value if $codepoint is too many digits
1319 $point = intval( $codepoint );
1320 if ( self::validateCodepoint( $point ) ) {
1321 return sprintf(
'&#%d;', $point );
1331 private static function hexCharReference( $codepoint ) {
1332 # hexdec() will return a float (not an int) if $codepoint is too
1333 # long, so protect against that. The largest valid codepoint is
1335 if ( strlen( ltrim( $codepoint,
'0' ) ) > 6 ) {
1338 $point = hexdec( $codepoint );
1339 if ( self::validateCodepoint( $point ) ) {
1340 return sprintf(
'&#x%x;', $point );
1352 private static function validateCodepoint( $codepoint ) {
1353 # U+000C is valid in HTML5 but not allowed in XML.
1354 # U+000D is valid in XML but not allowed in HTML5.
1355 # U+007F - U+009F are disallowed in HTML5 (control characters).
1356 return $codepoint == 0x09
1357 || $codepoint == 0x0a
1358 || ( $codepoint >= 0x20 && $codepoint <= 0x7e )
1359 || ( $codepoint >= 0xa0 && $codepoint <= 0xd7ff )
1360 || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
1361 || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
1372 return preg_replace_callback(
1373 self::CHAR_REFS_REGEX,
1374 [ self::class,
'decodeCharReferencesCallback' ],
1389 $text = preg_replace_callback(
1390 self::CHAR_REFS_REGEX,
1391 [ self::class,
'decodeCharReferencesCallback' ],
1398 return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $text );
1408 private static function decodeCharReferencesCallback(
$matches ) {
1410 return self::decodeEntity(
$matches[1] );
1412 return self::decodeChar( intval(
$matches[2] ) );
1414 # hexdec will return a float if the string is too long (!) so
1415 # check the length of the string first.
1416 if ( strlen( ltrim(
$matches[3],
'0' ) ) > 6 ) {
1418 return UtfNormal\Constants::UTF8_REPLACEMENT;
1420 return self::decodeChar( hexdec(
$matches[3] ) );
1422 # Last case should be an ampersand by itself
1433 private static function decodeChar( $codepoint ) {
1434 if ( self::validateCodepoint( $codepoint ) ) {
1435 return UtfNormal\Utils::codepointToUtf8( $codepoint );
1437 return UtfNormal\Constants::UTF8_REPLACEMENT;
1449 private static function decodeEntity( $name ) {
1451 if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
1452 $name = self::MW_ENTITY_ALIASES[$name];
1454 $trans = HTMLData::$namedEntityTranslations[$name] ??
null;
1455 return $trans ??
"&$name";
1465 private static function attributesAllowedInternal( $element ) {
1466 $list = self::setupAttributesAllowedInternal();
1467 return $list[$element] ?? [];
1477 private static function setupAttributesAllowedInternal() {
1480 if ( $allowed !==
null ) {
1486 $merge =
static function ( $a, $b, $c = [] ) {
1489 array_fill_keys( $b,
true ),
1490 array_fill_keys( $c,
true ) );
1492 $common = $merge( [], [
1512 # These attributes are specified in section 9 of
1520 # Microdata. These are specified by
1529 $block = $merge( $common, [
'align' ] );
1531 $tablealign = [
'align',
'valign' ];
1539 'nowrap', # deprecated
1540 'width', # deprecated
1541 'height', # deprecated
1542 'bgcolor', # deprecated
1545 # Numbers refer to sections in HTML 4.01 standard describing the element.
1550 'center' => $common, # deprecated
1569 'strong' => $common,
1580 'blockquote' => $merge( $common, [
'cite' ] ),
1581 'q' => $merge( $common, [
'cite' ] ),
1591 'br' => $merge( $common, [
'clear' ] ),
1597 'pre' => $merge( $common, [
'width' ] ),
1600 'ins' => $merge( $common, [
'cite',
'datetime' ] ),
1601 'del' => $merge( $common, [
'cite',
'datetime' ] ),
1604 'ul' => $merge( $common, [
'type' ] ),
1605 'ol' => $merge( $common, [
'type',
'start',
'reversed' ] ),
1606 'li' => $merge( $common, [
'type',
'value' ] ),
1614 'table' => $merge( $common,
1615 [
'summary',
'width',
'border',
'frame',
1616 'rules',
'cellspacing',
'cellpadding',
1621 'caption' => $block,
1629 'colgroup' => $merge( $common, [
'span' ] ),
1630 'col' => $merge( $common, [
'span' ] ),
1633 'tr' => $merge( $common, [
'bgcolor' ], $tablealign ),
1636 'td' => $merge( $common, $tablecell, $tablealign ),
1637 'th' => $merge( $common, $tablecell, $tablealign ),
1640 # NOTE: <a> is not allowed directly, but this list of allowed
1641 # attributes is used from the Parser object
1642 'a' => $merge( $common, [
'href',
'rel',
'rev' ] ), # rel/rev esp.
for RDFa
1645 # Not usually allowed, but may be used for extension-style hooks
1646 # such as <math> when it is rasterized, or if $wgAllowImageTag is
1648 'img' => $merge( $common, [
'alt',
'src',
'width',
'height',
'srcset' ] ),
1649 # Attributes for A/V tags added in T163583 / T133673
1650 'audio' => $merge( $common, [
'controls',
'preload',
'width',
'height' ] ),
1651 'video' => $merge( $common, [
'poster',
'controls',
'preload',
'width',
'height' ] ),
1652 'source' => $merge( $common, [
'type',
'src' ] ),
1653 'track' => $merge( $common, [
'type',
'src',
'srclang',
'kind',
'label' ] ),
1661 'strike' => $common,
1666 'font' => $merge( $common, [
'size',
'color',
'face' ] ),
1670 'hr' => $merge( $common, [
'width' ] ),
1672 # HTML Ruby annotation text module, simple ruby only.
1678 'rt' => $common, # $merge( $common, [
'rbspan' ] ),
1681 # MathML root element, where used for extensions
1682 # 'title' may not be 100% valid here; it's XHTML
1684 'math' => $merge( [], [
'class',
'style',
'id',
'title' ] ),
1687 'figure' => $common,
1688 'figcaption' => $common,
1690 # HTML 5 section 4.6
1693 # HTML5 elements, defined by:
1695 'data' => $merge( $common, [
'value' ] ),
1696 'time' => $merge( $common, [
'datetime' ] ),
1704 'meta' => $merge( [], [
'itemprop',
'content' ] ),
1705 'link' => $merge( [], [
'itemprop',
'href',
'title' ] ),
1707 # HTML 5 section 4.3.5
1728 $tokenizer =
new RemexTokenizer( $handler, $html, [
1729 'ignoreErrors' =>
true,
1731 'ignoreNulls' =>
true,
1732 'skipPreprocess' =>
true,
1734 $tokenizer->execute();
1735 $text = $handler->getResult();
1737 $text = self::normalizeWhitespace( $text );
1753 $out =
"<!DOCTYPE html [\n";
1754 foreach ( HTMLData::$namedEntityTranslations as $entity => $translation ) {
1755 if ( substr( $entity, -1 ) !==
';' ) {
1760 $name = substr( $entity, 0, -1 );
1761 $expansion = self::normalizeEntity( $entity );
1762 if ( $entity === $expansion ) {
1766 $out .=
"<!ENTITY $name \"$expansion\">";
1777 # Normalize any HTML entities in input. They will be
1778 # re-escaped by makeExternalLink().
1779 $url = self::decodeCharReferences( $url );
1781 # Escape any control characters introduced by the above step
1782 $url = preg_replace_callback(
'/[\][<>"\\x00-\\x20\\x7F\|]/',
1783 [ __CLASS__,
'cleanUrlCallback' ], $url );
1785 # Validate hostname portion
1787 if ( preg_match(
'!^([^:]+:)(//[^/]+)?(.*)$!iD', $url,
$matches ) ) {
1788 [ , $protocol, $host, $rest ] =
$matches;
1795 \\s| # general whitespace
1796 \u{00AD}| # SOFT HYPHEN
1797 \u{034F}| # COMBINING GRAPHEME JOINER
1798 \u{061C}| # ARABIC LETTER MARK
1799 [\u{115F}-\u{1160}]| # HANGUL CHOSEONG FILLER..
1800 # HANGUL JUNGSEONG FILLER
1801 [\u{17B4}-\u{17B5}]| # KHMER VOWEL INHERENT AQ..
1802 # KHMER VOWEL INHERENT AA
1803 [\u{180B}-\u{180D}]| # MONGOLIAN FREE VARIATION SELECTOR ONE..
1804 # MONGOLIAN FREE VARIATION SELECTOR THREE
1805 \u{180E}| # MONGOLIAN VOWEL SEPARATOR
1806 [\u{200B}-\u{200F}]| # ZERO WIDTH SPACE..
1807 # RIGHT-TO-LEFT MARK
1808 [\u{202A}-\u{202E}]| # LEFT-TO-RIGHT EMBEDDING..
1809 # RIGHT-TO-LEFT OVERRIDE
1810 [\u{2060}-\u{2064}]| # WORD JOINER..
1812 \u{2065}| # <reserved-2065>
1813 [\u{2066}-\u{206F}]| # LEFT-TO-RIGHT ISOLATE..
1814 # NOMINAL DIGIT SHAPES
1815 \u{3164}| # HANGUL FILLER
1816 [\u{FE00}-\u{FE0F}]| # VARIATION SELECTOR-1..
1817 # VARIATION SELECTOR-16
1818 \u{FEFF}| # ZERO WIDTH NO-BREAK SPACE
1819 \u{FFA0}| # HALFWIDTH HANGUL FILLER
1820 [\u{FFF0}-\u{FFF8}]| # <reserved-FFF0>..
1822 [\u{1BCA0}-\u{1BCA3}]| # SHORTHAND FORMAT LETTER OVERLAP..
1823 # SHORTHAND FORMAT UP STEP
1824 [\u{1D173}-\u{1D17A}]| # MUSICAL SYMBOL BEGIN BEAM..
1825 # MUSICAL SYMBOL END PHRASE
1826 \u{E0000}| # <reserved-E0000>
1827 \u{E0001}| # LANGUAGE TAG
1828 [\u{E0002}-\u{E001F}]| # <reserved-E0002>..
1830 [\u{E0020}-\u{E007F}]| # TAG SPACE..
1832 [\u{E0080}-\u{E00FF}]| # <reserved-E0080>..
1834 [\u{E0100}-\u{E01EF}]| # VARIATION SELECTOR-17..
1835 # VARIATION SELECTOR-256
1836 [\u{E01F0}-\u{E0FFF}]| # <reserved-E01F0>..
1840 $host = preg_replace( $strip,
'', $host );
1843 if ( str_starts_with( $host,
"//%5B" ) &&
1844 preg_match(
'!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host,
$matches )
1851 return $protocol . $host . $rest;
1861 private static function cleanUrlCallback(
$matches ) {
1896 if ( !
Hooks::runner()->onIsValidEmailAddr( $addr, $result ) ) {
1903 $rfc5322_atext =
"a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
1904 $rfc1034_ldh_str =
"a-z0-9\\-";
1906 $html5_email_regexp =
"/
1908 [$rfc5322_atext\\.]+ # user part which is liberal :p
1910 [$rfc1034_ldh_str]+ # First domain part
1911 (\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
1915 return (
bool)preg_match( $html5_email_regexp, $addr );
wfDeprecatedMsg( $msg, $version=false, $component=false, $callerOffset=2)
Log a deprecation warning with arbitrary message text.
wfUrlProtocols( $includeProtocolRelative=true)
Returns a regular expression of url protocols.
wfDeprecated( $function, $version=false, $component=false, $callerOffset=2)
Logs a warning that a deprecated feature was used.
static runner()
Get a HookRunner instance for calling hooks using the new interfaces.
HTML sanitizer for MediaWiki.
static fixTagAttributes( $text, $element, $sorted=false)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML,...
static isReservedDataAttribute( $attr)
Given an attribute name, checks whether it is a reserved data attribute (such as data-mw-foo) which i...
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
static checkCss( $value)
Pick apart some CSS and check it for forbidden or unsafe structures.
static validateAttributes( $attribs, $allowed)
Take an array of attribute names and values and normalize or discard illegal values.
static armorFrenchSpaces( $text, $space=' ')
Armor French spaces with a replacement character.
static getRecognizedTagData( $extratags=[], $removetags=[])
Return the various lists of recognized tags.
static escapeIdForLink( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
static removeHTMLcomments( $text)
Remove '<!--', '-->', and everything between.
static encodeAttribute( $text)
Encode an attribute value for HTML output.
static validateTagAttributes( $attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
static escapeClass( $class)
Given a value, escape it so that it can be used as a CSS class and return it.
static removeSomeTags(string $text, array $options=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; the result will alw...
static internalRemoveHtmlTags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; BEWARE there may be...
static normalizeSectionNameWhitespace( $section)
Normalizes whitespace in a section name, such as might be returned by Parser::stripSectionName(),...
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
static normalizeCharReferences( $text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
static removeHTMLtags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; BEWARE there may be...
static escapeIdReferenceList( $referenceString)
Given a string containing a space delimited list of ids, escape each id to match ids escaped by the e...
const ID_FALLBACK
Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false if no fallback...
static decodeCharReferences( $text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
static escapeIdForAttribute( $id, $mode=self::ID_PRIMARY)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid HTM...
static stripAllTags( $html)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed,...
static decodeCharReferencesAndNormalize( $text)
Decode any character references, numeric or named entities, in the text and normalize the resulting s...
static mergeAttributes( $a, $b)
Merge two sets of HTML attributes.
static escapeIdForExternalInterwiki( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
static validateEmail( $addr)
Does a string look like an e-mail address?
const ID_PRIMARY
Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding.
static safeEncodeAttribute( $text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing.
static normalizeCss( $value)
Normalize CSS into a format we can easily search for hostile input.
static delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags='')
Perform an operation equivalent to preg_replace() with flags.
$wgAllowImageTag
Config variable stub for the AllowImageTag setting, for use by phpdoc and IDEs.
$wgFragmentMode
Config variable stub for the FragmentMode setting, for use by phpdoc and IDEs.
$wgExternalInterwikiFragmentMode
Config variable stub for the ExternalInterwikiFragmentMode setting, for use by phpdoc and IDEs.