32 use Wikimedia\RemexHtml\HTMLData;
33 use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer;
34 use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer;
35 use Wikimedia\RemexHtml\TreeBuilder\Dispatcher as RemexDispatcher;
36 use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder as RemexTreeBuilder;
49 private const CHAR_REFS_REGEX =
50 '/&([A-Za-z0-9\x80-\xff]+;)
52 |&\#[xX]([0-9A-Fa-f]+);
59 private const ELEMENT_BITS_REGEX =
'!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
70 private const EVIL_URI_PATTERN =
'!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
71 private const XMLNS_ATTRIBUTE_PATTERN =
"/^xmlns:[:A-Z_a-z-.0-9]+$/";
92 private const MW_ENTITY_ALIASES = [
100 private static $attribsRegex;
108 private static function getAttribsRegex() {
109 if ( self::$attribsRegex ===
null ) {
110 $spaceChars =
'\x09\x0a\x0c\x0d\x20';
111 $space =
"[{$spaceChars}]";
112 $attrib =
"[^{$spaceChars}\/>=]";
113 $attribFirst =
"(?:{$attrib}|=)";
114 self::$attribsRegex =
115 "/({$attribFirst}{$attrib}*)
118 # The attribute value: quoted or alone
125 return self::$attribsRegex;
131 private static $attribNameRegex;
137 private static function getAttribNameRegex() {
138 if ( self::$attribNameRegex ===
null ) {
139 $attribFirst =
"[:_\p{L}\p{N}]";
140 $attrib =
"[:_\.\-\p{L}\p{N}]";
141 self::$attribNameRegex =
"/^({$attribFirst}{$attrib}*)$/sxu";
143 return self::$attribNameRegex;
155 static $commonCase, $staticInitialised;
156 $isCommonCase = ( $extratags === [] && $removetags === [] );
157 if ( $staticInitialised ===
$wgAllowImageTag && $isCommonCase && $commonCase ) {
161 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
162 $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic;
167 if ( !$staticInitialised || $staticInitialised !== $globalContext ) {
168 $htmlpairsStatic = [ # Tags that must be closed
169 'b',
'bdi',
'del',
'i',
'ins',
'u',
'font',
'big',
'small',
'sub',
'sup',
'h1',
170 'h2',
'h3',
'h4',
'h5',
'h6',
'cite',
'code',
'em',
's',
171 'strike',
'strong',
'tt',
'var',
'div',
'center',
172 'blockquote',
'ol',
'ul',
'dl',
'table',
'caption',
'pre',
173 'ruby',
'rb',
'rp',
'rt',
'rtc',
'p',
'span',
'abbr',
'dfn',
174 'kbd',
'samp',
'data',
'time',
'mark'
176 # These tags can be self-closed. For tags not also on
177 # $htmlsingleonly, a self-closed tag will be emitted as
178 # an empty element (open-tag/close-tag pair).
180 'br',
'wbr',
'hr',
'li',
'dt',
'dd',
'meta',
'link'
183 # Elements that cannot have close tags. This is (not coincidentally)
184 # also the list of tags for which the HTML 5 parsing algorithm
185 # requires you to "acknowledge the token's self-closing flag", i.e.
186 # a self-closing tag like <br/> is not an HTML 5 parse error only
189 'br',
'wbr',
'hr',
'meta',
'link'
192 $htmlnest = [ # Tags that can be nested--??
193 'table',
'tr',
'td',
'th',
'div',
'blockquote',
'ol',
'ul',
194 'li',
'dl',
'dt',
'dd',
'font',
'big',
'small',
'sub',
'sup',
'span',
195 'var',
'kbd',
'samp',
'em',
'strong',
'q',
'ruby',
'bdo'
197 $tabletags = [ # Can only appear inside table, we will close them
200 $htmllist = [ # Tags used by list
203 $listtags = [ # Tags that can appear in a list
209 'is deprecated since MediaWiki 1.35',
'1.35',
false,
false );
210 $htmlsingle[] =
'img';
211 $htmlsingleonly[] =
'img';
214 $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
215 $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
217 # Convert them all to hashtables for faster lookup
218 $vars = [
'htmlpairsStatic',
'htmlsingle',
'htmlsingleonly',
'htmlnest',
'tabletags',
219 'htmllist',
'listtags',
'htmlsingleallowed',
'htmlelementsStatic' ];
220 foreach ( $vars as $var ) {
221 $$var = array_fill_keys( $$var,
true );
223 $staticInitialised = $globalContext;
226 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
227 $extratags = array_fill_keys( $extratags,
true );
228 $removetags = array_fill_keys( $removetags,
true );
230 $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
232 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
235 'htmlpairs' => $htmlpairs,
236 'htmlsingle' => $htmlsingle,
237 'htmlsingleonly' => $htmlsingleonly,
238 'htmlnest' => $htmlnest,
239 'tabletags' => $tabletags,
240 'htmllist' => $htmllist,
241 'listtags' => $listtags,
242 'htmlsingleallowed' => $htmlsingleallowed,
243 'htmlelements' => $htmlelements,
245 if ( $isCommonCase ) {
246 $commonCase = $result;
282 $args = [], $extratags = [], $removetags = []
286 $text, $processCallback, $args, $extratags, $removetags
319 $args = [], $extratags = [], $removetags = []
322 $htmlsingle = $tagData[
'htmlsingle'];
323 $htmlsingleonly = $tagData[
'htmlsingleonly'];
324 $htmlelements = $tagData[
'htmlelements'];
326 # Remove HTML comments
328 $bits = explode(
'<', $text );
329 $text = str_replace(
'>',
'>', array_shift( $bits ) );
331 # this might be possible using remex tidy itself
332 foreach ( $bits as $x ) {
333 if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
334 [ , $slash,
$t, $params, $brace, $rest ] = $regs;
337 $t = strtolower(
$t );
338 if ( isset( $htmlelements[
$t] ) ) {
339 if ( is_callable( $processCallback ) ) {
340 call_user_func_array( $processCallback, [ &$params, $args ] );
343 if ( $brace ==
'/>' && !( isset( $htmlsingle[
$t] ) || isset( $htmlsingleonly[
$t] ) ) ) {
348 if ( !self::validateTag( $params,
$t ) ) {
354 if ( $brace ===
'/>' && !isset( $htmlsingleonly[
$t] ) ) {
355 # Interpret self-closing tags as empty tags even when
356 # HTML 5 would interpret them as start tags. Such input
357 # is commonly seen on Wikimedia wikis with this intention.
361 $rest = str_replace(
'>',
'>', $rest );
362 $text .=
"<$slash$t$newparams$brace$rest";
367 $text .=
'<' . str_replace(
'>',
'>', $x );
394 string $text, array $options = []
396 $extraTags = $options[
'extraTags'] ?? [];
397 $removeTags = $options[
'removeTags'] ?? [];
399 $attrCallback = $options[
'attrCallback'] ??
null;
400 $attrCallbackArgs = $options[
'attrCallbackArgs'] ?? [];
409 $serializer =
new RemexSerializer( $formatter );
410 $treeBuilder =
new RemexTreeBuilder( $serializer, [
411 'ignoreErrors' =>
true,
412 'ignoreNulls' =>
true,
414 $dispatcher =
new RemexDispatcher( $treeBuilder );
415 $tokenHandler = $dispatcher;
417 $tokenHandler, $text, $tagData,
418 $attrCallback, $attrCallbackArgs
420 $tokenizer =
new RemexTokenizer( $remover, $text, [
421 'ignoreErrors' =>
true,
423 'ignoreNulls' =>
true,
424 'skipPreprocess' =>
true,
426 $tokenizer->execute( [
427 'fragmentNamespace' => HTMLData::NS_HTML,
428 'fragmentName' =>
'body',
430 return $serializer->getResult();
443 while ( ( $start = strpos( $text,
'<!--' ) ) !==
false ) {
444 $end = strpos( $text,
'-->', $start + 4 );
445 if ( $end ===
false ) {
446 # Unterminated comment; bail out
452 # Trim space and newline if the comment is both
453 # preceded and followed by a newline
454 $spaceStart = max( $start - 1, 0 );
455 $spaceLen = $end - $spaceStart;
456 while ( substr( $text, $spaceStart, 1 ) ===
' ' && $spaceStart > 0 ) {
460 while ( substr( $text, $spaceStart + $spaceLen, 1 ) ===
' ' ) {
463 if ( substr( $text, $spaceStart, 1 ) ===
"\n"
464 && substr( $text, $spaceStart + $spaceLen, 1 ) ===
"\n" ) {
465 # Remove the comment, leading and trailing
466 # spaces, and leave only one newline.
467 $text = substr_replace( $text,
"\n", $spaceStart, $spaceLen + 1 );
469 # Remove just the comment.
470 $text = substr_replace( $text,
'', $start, $end - $start );
490 private static function validateTag( $params, $element ) {
491 $params = self::decodeTagAttributes( $params );
493 if ( $element ==
'meta' || $element ==
'link' ) {
494 if ( !isset( $params[
'itemprop'] ) ) {
498 if ( $element ==
'meta' && !isset( $params[
'content'] ) ) {
502 if ( $element ==
'link' && !isset( $params[
'href'] ) ) {
527 return self::validateAttributes( $attribs,
528 self::attributesAllowedInternal( $element ) );
550 if ( isset( $allowed[0] ) ) {
553 wfDeprecated( __METHOD__ .
' with sequential array',
'1.35' );
554 $allowed = array_fill_keys( $allowed,
true );
559 foreach ( $attribs as $attribute => $value ) {
560 # Allow XML namespace declaration to allow RDFa
561 if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
562 if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
563 $out[$attribute] = $value;
569 # Allow any attribute beginning with "data-"
571 # * Disallow data attributes used by MediaWiki code
572 # * Ensure that the attribute is not namespaced by banning
575 !preg_match(
'/^data-[^:]*$/i', $attribute ) &&
576 !array_key_exists( $attribute, $allowed )
577 ) || self::isReservedDataAttribute( $attribute ) ) {
581 # Strip javascript "expression" from stylesheets.
583 if ( $attribute ==
'style' ) {
584 $value = self::checkCss( $value );
587 # Escape HTML id attributes
588 if ( $attribute ===
'id' ) {
589 $value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
592 # Escape HTML id reference lists
593 if ( $attribute ===
'aria-describedby'
594 || $attribute ===
'aria-flowto'
595 || $attribute ===
'aria-labelledby'
596 || $attribute ===
'aria-owns'
598 $value = self::escapeIdReferenceListInternal( $value );
602 if ( $attribute ===
'rel' || $attribute ===
'rev'
604 || $attribute ===
'about' || $attribute ===
'property'
605 || $attribute ===
'resource' || $attribute ===
'datatype'
606 || $attribute ===
'typeof'
608 || $attribute ===
'itemid' || $attribute ===
'itemprop'
609 || $attribute ===
'itemref' || $attribute ===
'itemscope'
610 || $attribute ===
'itemtype'
613 if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
618 # NOTE: even though elements using href/src are not allowed directly, supply
619 # validation code that can be used by tag hook handlers, etc
620 if ( $attribute ===
'href' || $attribute ===
'src' || $attribute ===
'poster' ) {
621 if ( !preg_match( $hrefExp, $value ) ) {
627 if ( $attribute ===
'tabindex' && $value !==
'0' ) {
634 $out[$attribute] = $value;
637 # itemtype, itemid, itemref don't make sense without itemscope
638 if ( !array_key_exists(
'itemscope', $out ) ) {
639 unset( $out[
'itemtype'] );
640 unset( $out[
'itemid'] );
641 unset( $out[
'itemref'] );
643 # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
663 return (
bool)preg_match(
'/^data-(ooui|mw|parsoid)/i', $attr );
677 $out = array_merge( $a, $b );
678 if ( isset( $a[
'class'] ) && isset( $b[
'class'] )
679 && is_string( $a[
'class'] ) && is_string( $b[
'class'] )
680 && $a[
'class'] !== $b[
'class']
682 $classes = preg_split(
'/\s+/',
"{$a['class']} {$b['class']}",
683 -1, PREG_SPLIT_NO_EMPTY );
684 $out[
'class'] = implode(
' ', array_unique( $classes ) );
699 $value = self::decodeCharReferences( $value );
711 if ( !$decodeRegex ) {
712 $space =
'[\\x20\\t\\r\\n\\f]';
713 $nl =
'(?:\\n|\\r\\n|\\r|\\f)';
715 $decodeRegex =
"/ $backslash
717 ($nl) | # 1. Line continuation
718 ([0-9A-Fa-f]{1,6})$space? | # 2. character number
719 (.) | # 3. backslash cancelling special meaning
720 () | # 4. backslash at end of string
723 $value = preg_replace_callback( $decodeRegex,
724 [ __CLASS__,
'cssDecodeCallback' ], $value );
729 if ( !preg_match(
'! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
740 $commentPos = strpos( $value,
'/*' );
741 if ( $commentPos !==
false ) {
742 $value = substr( $value, 0, $commentPos );
768 $value = self::normalizeCss( $value );
771 if ( preg_match(
'/[\000-\010\013\016-\037\177]/', $value ) ||
772 strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !==
false ) {
773 return '/* invalid control char */';
774 } elseif ( preg_match(
784 | attr\s*\([^)]+[\s,]+url
786 return '/* insecure input */';
795 private static function cssDecodeCallback(
$matches ) {
800 # hexdec could return a float if the match is too long, but the
801 # regexp in question limits the string length to 6.
802 $char = UtfNormal\Utils::codepointToUtf8( hexdec(
$matches[2] ) );
808 if ( $char ==
"\n" || $char ==
'"' || $char ==
"'" || $char ==
'\\' ) {
811 return '\\' . dechex( ord( $char ) ) .
' ';
840 if ( trim( $text ) ==
'' ) {
844 $decoded = self::decodeTagAttributes( $text );
845 $stripped = self::validateTagAttributes( $decoded, $element );
851 return self::safeEncodeTagAttributes( $stripped );
860 $encValue = htmlspecialchars( $text, ENT_QUOTES );
865 $encValue = strtr( $encValue, [
886 # French spaces, last one Guillemet-left
887 # only if it isn't followed by a word character.
888 '/ (?=[?:;!%»›](?!\w))/u' =>
"$space",
889 # French spaces, Guillemet-right
890 '/([«‹]) /u' =>
"\\1$space",
892 return preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
902 $encValue = self::encodeAttribute( $text );
904 # Templates and links may be expanded in later parsing,
905 # creating invalid or dangerous output. Suppress this.
906 $encValue = strtr( $encValue, [
914 "''" =>
'''',
915 'ISBN' =>
'ISBN',
917 'PMID' =>
'PMID',
923 $encValue = preg_replace_callback(
926 return str_replace(
':',
':',
$matches[1] );
951 if ( $mode === self::ID_PRIMARY ) {
952 throw new UnexpectedValueException(
'$wgFragmentMode is configured with no primary mode' );
959 return self::escapeIdInternal( $id, $internalMode );
978 throw new UnexpectedValueException(
'$wgFragmentMode is configured with no primary mode' );
983 $id = self::escapeIdInternalUrl( $id, $mode );
1014 private static function escapeIdInternalUrl( $id, $mode ) {
1015 $id = self::escapeIdInternal( $id, $mode );
1016 if ( $mode ===
'html5' ) {
1017 $id = preg_replace(
'/%([a-fA-F0-9]{2})/',
'%25$1', $id );
1029 private static function escapeIdInternal( $id, $mode ) {
1032 $id = mb_substr( $id, 0, 1024 );
1040 $id = str_replace( [
"\t",
"\n",
"\f",
"\r",
" " ],
'_', $id );
1049 $id = urlencode( str_replace(
' ',
'_', $id ) );
1050 $id = strtr( $id, $replace );
1053 throw new InvalidArgumentException(
"Invalid mode '$mode' passed to '" . __METHOD__ );
1071 return self::escapeIdReferenceListInternal( $referenceString );
1081 private static function escapeIdReferenceListInternal( $referenceString ) {
1082 # Explode the space delimited list string into an array of tokens
1083 $references = preg_split(
'/\s+/',
"{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );
1085 # Escape each token as an id
1086 foreach ( $references as &$ref ) {
1087 $ref = self::escapeIdForAttribute( $ref );
1090 # Merge the array back to a space delimited list string
1091 # If the array is empty, the result will be an empty string ('')
1092 $referenceString = implode(
' ', $references );
1094 return $referenceString;
1110 return rtrim( preg_replace(
1111 [
'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
'/_+/' ],
1124 $html = self::decodeCharReferences( $html );
1125 # It seems wise to escape ' as well as ", as a matter of course. Can't
1126 # hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
1127 # don't cause the entire string to disappear.
1128 $html = htmlspecialchars( $html, ENT_QUOTES | ENT_SUBSTITUTE );
1140 public static function decodeTagAttributes( $text ) {
1141 if ( trim( $text ) == '' ) {
1146 if ( !preg_match_all(
1147 self::getAttribsRegex(),
1150 PREG_SET_ORDER ) ) {
1155 foreach ( $pairs as $set ) {
1156 $attribute = strtolower( $set[1] );
1158 // Filter attribute names with unacceptable characters
1159 if ( !preg_match( self::getAttribNameRegex(), $attribute ) ) {
1163 $value = self::getTagAttributeCallback( $set );
1165 // Normalize whitespace
1166 $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
1167 $value = trim( $value );
1169 // Decode character references
1170 $attribs[$attribute] = self::decodeCharReferences( $value );
1182 public static function safeEncodeTagAttributes( $assoc_array ) {
1184 foreach ( $assoc_array as $attribute => $value ) {
1185 $encAttribute = htmlspecialchars( $attribute, ENT_COMPAT );
1186 $encValue = self::safeEncodeAttribute( $value );
1188 $attribs[] = "$encAttribute=\"$encValue\"";
1190 return count( $attribs ) ?
' ' . implode(
' ', $attribs ) :
'';
1201 private static function getTagAttributeCallback( $set ) {
1202 if ( isset( $set[5] ) ) {
1205 } elseif ( isset( $set[4] ) ) {
1208 } elseif ( isset( $set[3] ) ) {
1211 } elseif ( !isset( $set[2] ) ) {
1212 # In XHTML, attributes must have a value so return an empty string.
1213 # See "Empty attribute syntax",
1217 throw new MWException(
"Tag conditions not met. This should never happen and is a bug." );
1225 private static function normalizeWhitespace( $text ) {
1226 return trim( preg_replace(
1227 '/(?:\r\n|[\x20\x0d\x0a\x09])+/',
1241 return trim( preg_replace(
'/[ _]+/',
' ', $section ) );
1260 return preg_replace_callback(
1261 self::CHAR_REFS_REGEX,
1262 [ self::class,
'normalizeCharReferencesCallback' ],
1270 private static function normalizeCharReferencesCallback(
$matches ) {
1273 $ret = self::normalizeEntity(
$matches[1] );
1275 $ret = self::decCharReference(
$matches[2] );
1277 $ret = self::hexCharReference(
$matches[3] );
1279 if ( $ret ===
null ) {
1280 return htmlspecialchars(
$matches[0], ENT_COMPAT );
1296 private static function normalizeEntity( $name ) {
1297 if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
1299 return '&' . self::MW_ENTITY_ALIASES[$name];
1300 } elseif ( in_array( $name, [
'lt;',
'gt;',
'amp;',
'quot;' ],
true ) ) {
1303 } elseif ( isset( HTMLData::$namedEntityTranslations[$name] ) ) {
1305 return preg_replace_callback(
'/./Ssu',
static function ( $m ) {
1306 return '&#' . UtfNormal\Utils::utf8ToCodepoint( $m[0] ) .
';';
1307 }, HTMLData::$namedEntityTranslations[$name] );
1309 return "&$name";
1317 private static function decCharReference( $codepoint ) {
1318 # intval() will (safely) saturate at the maximum signed integer
1319 # value if $codepoint is too many digits
1320 $point = intval( $codepoint );
1321 if ( self::validateCodepoint( $point ) ) {
1322 return sprintf(
'&#%d;', $point );
1332 private static function hexCharReference( $codepoint ) {
1333 # hexdec() will return a float (not an int) if $codepoint is too
1334 # long, so protect against that. The largest valid codepoint is
1336 if ( strlen( ltrim( $codepoint,
'0' ) ) > 6 ) {
1339 $point = hexdec( $codepoint );
1340 if ( self::validateCodepoint( $point ) ) {
1341 return sprintf(
'&#x%x;', $point );
1353 private static function validateCodepoint( $codepoint ) {
1354 # U+000C is valid in HTML5 but not allowed in XML.
1355 # U+000D is valid in XML but not allowed in HTML5.
1356 # U+007F - U+009F are disallowed in HTML5 (control characters).
1357 return $codepoint == 0x09
1358 || $codepoint == 0x0a
1359 || ( $codepoint >= 0x20 && $codepoint <= 0x7e )
1360 || ( $codepoint >= 0xa0 && $codepoint <= 0xd7ff )
1361 || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
1362 || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
1373 return preg_replace_callback(
1374 self::CHAR_REFS_REGEX,
1375 [ self::class,
'decodeCharReferencesCallback' ],
1390 $text = preg_replace_callback(
1391 self::CHAR_REFS_REGEX,
1392 [ self::class,
'decodeCharReferencesCallback' ],
1399 return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $text );
1409 private static function decodeCharReferencesCallback(
$matches ) {
1411 return self::decodeEntity(
$matches[1] );
1413 return self::decodeChar( intval(
$matches[2] ) );
1415 # hexdec will return a float if the string is too long (!) so
1416 # check the length of the string first.
1417 if ( strlen( ltrim(
$matches[3],
'0' ) ) > 6 ) {
1419 return UtfNormal\Constants::UTF8_REPLACEMENT;
1421 return self::decodeChar( hexdec(
$matches[3] ) );
1423 # Last case should be an ampersand by itself
1434 private static function decodeChar( $codepoint ) {
1435 if ( self::validateCodepoint( $codepoint ) ) {
1436 return UtfNormal\Utils::codepointToUtf8( $codepoint );
1438 return UtfNormal\Constants::UTF8_REPLACEMENT;
1450 private static function decodeEntity( $name ) {
1452 if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
1453 $name = self::MW_ENTITY_ALIASES[$name];
1455 $trans = HTMLData::$namedEntityTranslations[$name] ??
null;
1456 return $trans ??
"&$name";
1466 private static function attributesAllowedInternal( $element ) {
1467 $list = self::setupAttributesAllowedInternal();
1468 return $list[$element] ?? [];
1478 private static function setupAttributesAllowedInternal() {
1481 if ( $allowed !==
null ) {
1487 $merge =
static function ( $a, $b, $c = [] ) {
1490 array_fill_keys( $b,
true ),
1491 array_fill_keys( $c,
true ) );
1493 $common = $merge( [], [
1514 # These attributes are specified in section 9 of
1522 # Microdata. These are specified by
1531 $block = $merge( $common, [
'align' ] );
1533 $tablealign = [
'align',
'valign' ];
1541 'nowrap', # deprecated
1542 'width', # deprecated
1543 'height', # deprecated
1544 'bgcolor', # deprecated
1547 # Numbers refer to sections in HTML 4.01 standard describing the element.
1552 'center' => $common, # deprecated
1571 'strong' => $common,
1582 'blockquote' => $merge( $common, [
'cite' ] ),
1583 'q' => $merge( $common, [
'cite' ] ),
1593 'br' => $merge( $common, [
'clear' ] ),
1599 'pre' => $merge( $common, [
'width' ] ),
1602 'ins' => $merge( $common, [
'cite',
'datetime' ] ),
1603 'del' => $merge( $common, [
'cite',
'datetime' ] ),
1606 'ul' => $merge( $common, [
'type' ] ),
1607 'ol' => $merge( $common, [
'type',
'start',
'reversed' ] ),
1608 'li' => $merge( $common, [
'type',
'value' ] ),
1616 'table' => $merge( $common,
1617 [
'summary',
'width',
'border',
'frame',
1618 'rules',
'cellspacing',
'cellpadding',
1623 'caption' => $block,
1631 'colgroup' => $merge( $common, [
'span' ] ),
1632 'col' => $merge( $common, [
'span' ] ),
1635 'tr' => $merge( $common, [
'bgcolor' ], $tablealign ),
1638 'td' => $merge( $common, $tablecell, $tablealign ),
1639 'th' => $merge( $common, $tablecell, $tablealign ),
1642 # NOTE: <a> is not allowed directly, but this list of allowed
1643 # attributes is used from the Parser object
1644 'a' => $merge( $common, [
'href',
'rel',
'rev' ] ), # rel/rev esp.
for RDFa
1647 # Not usually allowed, but may be used for extension-style hooks
1648 # such as <math> when it is rasterized, or if $wgAllowImageTag is
1650 'img' => $merge( $common, [
'alt',
'src',
'width',
'height',
'srcset' ] ),
1651 # Attributes for A/V tags added in T163583 / T133673
1652 'audio' => $merge( $common, [
'controls',
'preload',
'width',
'height' ] ),
1653 'video' => $merge( $common, [
'poster',
'controls',
'preload',
'width',
'height' ] ),
1654 'source' => $merge( $common, [
'type',
'src' ] ),
1655 'track' => $merge( $common, [
'type',
'src',
'srclang',
'kind',
'label' ] ),
1663 'strike' => $common,
1668 'font' => $merge( $common, [
'size',
'color',
'face' ] ),
1672 'hr' => $merge( $common, [
'width' ] ),
1674 # HTML Ruby annotation text module, simple ruby only.
1680 'rt' => $common, # $merge( $common, [
'rbspan' ] ),
1683 # MathML root element, where used for extensions
1684 # 'title' may not be 100% valid here; it's XHTML
1686 'math' => $merge( [], [
'class',
'style',
'id',
'title' ] ),
1689 'figure' => $common,
1690 'figcaption' => $common,
1692 # HTML 5 section 4.6
1695 # HTML5 elements, defined by:
1697 'data' => $merge( $common, [
'value' ] ),
1698 'time' => $merge( $common, [
'datetime' ] ),
1706 'meta' => $merge( [], [
'itemprop',
'content' ] ),
1707 'link' => $merge( [], [
'itemprop',
'href',
'title' ] ),
1709 # HTML 5 section 4.3.5
1730 $tokenizer =
new RemexTokenizer( $handler, $html, [
1731 'ignoreErrors' =>
true,
1733 'ignoreNulls' =>
true,
1734 'skipPreprocess' =>
true,
1736 $tokenizer->execute();
1737 $text = $handler->getResult();
1739 $text = self::normalizeWhitespace( $text );
1755 $out =
"<!DOCTYPE html [\n";
1756 foreach ( HTMLData::$namedEntityTranslations as $entity => $translation ) {
1757 if ( substr( $entity, -1 ) !==
';' ) {
1762 $name = substr( $entity, 0, -1 );
1763 $expansion = self::normalizeEntity( $entity );
1764 if ( $entity === $expansion ) {
1768 $out .=
"<!ENTITY $name \"$expansion\">";
1779 # Normalize any HTML entities in input. They will be
1780 # re-escaped by makeExternalLink().
1781 $url = self::decodeCharReferences( $url );
1783 # Escape any control characters introduced by the above step
1784 $url = preg_replace_callback(
'/[\][<>"\\x00-\\x20\\x7F\|]/',
1785 [ __CLASS__,
'cleanUrlCallback' ], $url );
1787 # Validate hostname portion
1789 if ( preg_match(
'!^([^:]+:)(//[^/]+)?(.*)$!iD', $url,
$matches ) ) {
1790 [ , $protocol, $host, $rest ] =
$matches;
1797 \\s| # general whitespace
1798 \u{00AD}| # SOFT HYPHEN
1799 \u{034F}| # COMBINING GRAPHEME JOINER
1800 \u{061C}| # ARABIC LETTER MARK
1801 [\u{115F}-\u{1160}]| # HANGUL CHOSEONG FILLER..
1802 # HANGUL JUNGSEONG FILLER
1803 [\u{17B4}-\u{17B5}]| # KHMER VOWEL INHERENT AQ..
1804 # KHMER VOWEL INHERENT AA
1805 [\u{180B}-\u{180D}]| # MONGOLIAN FREE VARIATION SELECTOR ONE..
1806 # MONGOLIAN FREE VARIATION SELECTOR THREE
1807 \u{180E}| # MONGOLIAN VOWEL SEPARATOR
1808 [\u{200B}-\u{200F}]| # ZERO WIDTH SPACE..
1809 # RIGHT-TO-LEFT MARK
1810 [\u{202A}-\u{202E}]| # LEFT-TO-RIGHT EMBEDDING..
1811 # RIGHT-TO-LEFT OVERRIDE
1812 [\u{2060}-\u{2064}]| # WORD JOINER..
1814 \u{2065}| # <reserved-2065>
1815 [\u{2066}-\u{206F}]| # LEFT-TO-RIGHT ISOLATE..
1816 # NOMINAL DIGIT SHAPES
1817 \u{3164}| # HANGUL FILLER
1818 [\u{FE00}-\u{FE0F}]| # VARIATION SELECTOR-1..
1819 # VARIATION SELECTOR-16
1820 \u{FEFF}| # ZERO WIDTH NO-BREAK SPACE
1821 \u{FFA0}| # HALFWIDTH HANGUL FILLER
1822 [\u{FFF0}-\u{FFF8}]| # <reserved-FFF0>..
1824 [\u{1BCA0}-\u{1BCA3}]| # SHORTHAND FORMAT LETTER OVERLAP..
1825 # SHORTHAND FORMAT UP STEP
1826 [\u{1D173}-\u{1D17A}]| # MUSICAL SYMBOL BEGIN BEAM..
1827 # MUSICAL SYMBOL END PHRASE
1828 \u{E0000}| # <reserved-E0000>
1829 \u{E0001}| # LANGUAGE TAG
1830 [\u{E0002}-\u{E001F}]| # <reserved-E0002>..
1832 [\u{E0020}-\u{E007F}]| # TAG SPACE..
1834 [\u{E0080}-\u{E00FF}]| # <reserved-E0080>..
1836 [\u{E0100}-\u{E01EF}]| # VARIATION SELECTOR-17..
1837 # VARIATION SELECTOR-256
1838 [\u{E01F0}-\u{E0FFF}]| # <reserved-E01F0>..
1842 $host = preg_replace( $strip,
'', $host );
1845 if ( str_starts_with( $host,
"//%5B" ) &&
1846 preg_match(
'!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host,
$matches )
1853 return $protocol . $host . $rest;
1863 private static function cleanUrlCallback(
$matches ) {
1898 $hookRunner =
new HookRunner( MediaWikiServices::getInstance()->getHookContainer() );
1899 if ( !$hookRunner->onIsValidEmailAddr( $addr, $result ) ) {
1906 $rfc5322_atext =
"a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
1907 $rfc1034_ldh_str =
"a-z0-9\\-";
1909 $html5_email_regexp =
"/
1911 [$rfc5322_atext\\.]+ # user part which is liberal :p
1913 [$rfc1034_ldh_str]+ # First domain part
1914 (\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
1918 return (
bool)preg_match( $html5_email_regexp, $addr );
wfDeprecatedMsg( $msg, $version=false, $component=false, $callerOffset=2)
Log a deprecation warning with arbitrary message text.
wfUrlProtocols( $includeProtocolRelative=true)
Returns a regular expression of url protocols.
wfDeprecated( $function, $version=false, $component=false, $callerOffset=2)
Logs a warning that a deprecated feature was used.
HTML sanitizer for MediaWiki.
static fixTagAttributes( $text, $element, $sorted=false)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML,...
static isReservedDataAttribute( $attr)
Given an attribute name, checks whether it is a reserved data attribute (such as data-mw-foo) which i...
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
static checkCss( $value)
Pick apart some CSS and check it for forbidden or unsafe structures.
static validateAttributes( $attribs, $allowed)
Take an array of attribute names and values and normalize or discard illegal values.
static armorFrenchSpaces( $text, $space=' ')
Armor French spaces with a replacement character.
static getRecognizedTagData( $extratags=[], $removetags=[])
Return the various lists of recognized tags.
static escapeIdForLink( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
static removeHTMLcomments( $text)
Remove '<!--', '-->', and everything between.
static encodeAttribute( $text)
Encode an attribute value for HTML output.
static validateTagAttributes( $attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
static escapeClass( $class)
Given a value, escape it so that it can be used as a CSS class and return it.
static removeSomeTags(string $text, array $options=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; the result will alw...
static internalRemoveHtmlTags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; BEWARE there may be...
static normalizeSectionNameWhitespace( $section)
Normalizes whitespace in a section name, such as might be returned by Parser::stripSectionName(),...
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
static normalizeCharReferences( $text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
static removeHTMLtags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; BEWARE there may be...
static escapeIdReferenceList( $referenceString)
Given a string containing a space delimited list of ids, escape each id to match ids escaped by the e...
const ID_FALLBACK
Tells escapeIdForAttribute() to encode the ID using the fallback encoding, or return false if no fallback...
static decodeCharReferences( $text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
static escapeIdForAttribute( $id, $mode=self::ID_PRIMARY)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid HTM...
static stripAllTags( $html)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed,...
static decodeCharReferencesAndNormalize( $text)
Decode any character references, numeric or named entities, in the text and normalize the resulting s...
static mergeAttributes( $a, $b)
Merge two sets of HTML attributes.
static escapeIdForExternalInterwiki( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
static validateEmail( $addr)
Does a string look like an e-mail address?
const ID_PRIMARY
Tells escapeIdForAttribute() to encode the ID using the wiki's primary encoding.
static safeEncodeAttribute( $text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing.
static normalizeCss( $value)
Normalize CSS into a format we can easily search for hostile input.
static delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags='')
Perform an operation equivalent to preg_replace() with flags.
$wgAllowImageTag
Config variable stub for the AllowImageTag setting, for use by phpdoc and IDEs.
$wgFragmentMode
Config variable stub for the FragmentMode setting, for use by phpdoc and IDEs.
$wgExternalInterwikiFragmentMode
Config variable stub for the ExternalInterwikiFragmentMode setting, for use by phpdoc and IDEs.