31use Wikimedia\RemexHtml\HTMLData;
32use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer;
33use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer;
34use Wikimedia\RemexHtml\TreeBuilder\Dispatcher as RemexDispatcher;
35use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder as RemexTreeBuilder;
48 private const CHAR_REFS_REGEX =
49 '/&([A-Za-z0-9\x80-\xff]+;)
51 |&\#[xX]([0-9A-Fa-f]+);
58 private const ELEMENT_BITS_REGEX =
'!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
69 private const EVIL_URI_PATTERN =
'!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
70 private const XMLNS_ATTRIBUTE_PATTERN =
"/^xmlns:[:A-Z_a-z-.0-9]+$/";
91 private const MW_ENTITY_ALIASES = [
99 private static $attribsRegex;
107 private static function getAttribsRegex() {
108 if ( self::$attribsRegex ===
null ) {
109 $spaceChars =
'\x09\x0a\x0c\x0d\x20';
110 $space =
"[{$spaceChars}]";
111 $attrib =
"[^{$spaceChars}\/>=]";
112 $attribFirst =
"(?:{$attrib}|=)";
113 self::$attribsRegex =
114 "/({$attribFirst}{$attrib}*)
117 # The attribute value: quoted or alone
124 return self::$attribsRegex;
130 private static $attribNameRegex;
136 private static function getAttribNameRegex() {
137 if ( self::$attribNameRegex ===
null ) {
138 $attribFirst =
"[:_\p{L}\p{N}]";
139 $attrib =
"[:_\.\-\p{L}\p{N}]";
140 self::$attribNameRegex =
"/^({$attribFirst}{$attrib}*)$/sxu";
142 return self::$attribNameRegex;
154 static $commonCase, $staticInitialised;
155 $isCommonCase = ( $extratags === [] && $removetags === [] );
156 if ( $staticInitialised ===
$wgAllowImageTag && $isCommonCase && $commonCase ) {
160 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
161 $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic;
166 if ( !$staticInitialised || $staticInitialised !== $globalContext ) {
167 $htmlpairsStatic = [ # Tags that must be closed
168 'b',
'bdi',
'del',
'i',
'ins',
'u',
'font',
'big',
'small',
'sub',
'sup',
'h1',
169 'h2',
'h3',
'h4',
'h5',
'h6',
'cite',
'code',
'em',
's',
170 'strike',
'strong',
'tt',
'var',
'div',
'center',
171 'blockquote',
'ol',
'ul',
'dl',
'table',
'caption',
'pre',
172 'ruby',
'rb',
'rp',
'rt',
'rtc',
'p',
'span',
'abbr',
'dfn',
173 'kbd',
'samp',
'data',
'time',
'mark'
175 # These tags can be self-closed. For tags not also on
176 # $htmlsingleonly, a self-closed tag will be emitted as
177 # an empty element (open-tag/close-tag pair).
179 'br',
'wbr',
'hr',
'li',
'dt',
'dd',
'meta',
'link'
182 # Elements that cannot have close tags. This is (not coincidentally)
183 # also the list of tags for which the HTML 5 parsing algorithm
184 # requires you to "acknowledge the token's self-closing flag", i.e.
185 # a self-closing tag like <br/> is not an HTML 5 parse error only
188 'br',
'wbr',
'hr',
'meta',
'link'
191 $htmlnest = [ # Tags that can be nested--??
192 'table',
'tr',
'td',
'th',
'div',
'blockquote',
'ol',
'ul',
193 'li',
'dl',
'dt',
'dd',
'font',
'big',
'small',
'sub',
'sup',
'span',
194 'var',
'kbd',
'samp',
'em',
'strong',
'q',
'ruby',
'bdo'
196 $tabletags = [ # Can only appear inside table, we will close them
199 $htmllist = [ # Tags used by list
202 $listtags = [ # Tags that can appear in a list
208 'is deprecated since MediaWiki 1.35',
'1.35',
false,
false );
209 $htmlsingle[] =
'img';
210 $htmlsingleonly[] =
'img';
213 $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
214 $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
216 # Convert them all to hashtables for faster lookup
217 $vars = [
'htmlpairsStatic',
'htmlsingle',
'htmlsingleonly',
'htmlnest',
'tabletags',
218 'htmllist',
'listtags',
'htmlsingleallowed',
'htmlelementsStatic' ];
219 foreach ( $vars as $var ) {
220 $$var = array_fill_keys( $$var,
true );
222 $staticInitialised = $globalContext;
225 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
226 $extratags = array_fill_keys( $extratags,
true );
227 $removetags = array_fill_keys( $removetags,
true );
229 $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
231 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
234 'htmlpairs' => $htmlpairs,
235 'htmlsingle' => $htmlsingle,
236 'htmlsingleonly' => $htmlsingleonly,
237 'htmlnest' => $htmlnest,
238 'tabletags' => $tabletags,
239 'htmllist' => $htmllist,
240 'listtags' => $listtags,
241 'htmlsingleallowed' => $htmlsingleallowed,
242 'htmlelements' => $htmlelements,
244 if ( $isCommonCase ) {
245 $commonCase = $result;
281 $args = [], $extratags = [], $removetags = []
284 return self::internalRemoveHtmlTags(
285 $text, $processCallback,
$args, $extratags, $removetags
318 $args = [], $extratags = [], $removetags = []
320 $tagData = self::getRecognizedTagData( $extratags, $removetags );
321 $htmlsingle = $tagData[
'htmlsingle'];
322 $htmlsingleonly = $tagData[
'htmlsingleonly'];
323 $htmlelements = $tagData[
'htmlelements'];
325 # Remove HTML comments
326 $text = self::removeHTMLcomments( $text );
327 $bits = explode(
'<', $text );
328 $text = str_replace(
'>',
'>', array_shift( $bits ) );
330 # this might be possible using remex tidy itself
331 foreach ( $bits as $x ) {
332 if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
333 list( , $slash,
$t, $params, $brace, $rest ) = $regs;
336 $t = strtolower(
$t );
337 if ( isset( $htmlelements[
$t] ) ) {
338 if ( is_callable( $processCallback ) ) {
339 call_user_func_array( $processCallback, [ &$params,
$args ] );
342 if ( $brace ==
'/>' && !( isset( $htmlsingle[
$t] ) || isset( $htmlsingleonly[
$t] ) ) ) {
347 if ( !self::validateTag( $params,
$t ) ) {
351 $newparams = self::fixTagAttributes( $params,
$t );
353 if ( $brace ===
'/>' && !isset( $htmlsingleonly[
$t] ) ) {
354 # Interpret self-closing tags as empty tags even when
355 # HTML 5 would interpret them as start tags. Such input
356 # is commonly seen on Wikimedia wikis with this intention.
360 $rest = str_replace(
'>',
'>', $rest );
361 $text .=
"<$slash$t$newparams$brace$rest";
366 $text .=
'<' . str_replace(
'>',
'>', $x );
393 string $text, array $options = []
395 $extraTags = $options[
'extraTags'] ?? [];
396 $removeTags = $options[
'removeTags'] ?? [];
398 $attrCallback = $options[
'attrCallback'] ??
null;
399 $attrCallbackArgs = $options[
'attrCallbackArgs'] ?? [];
403 $text = self::normalizeCharReferences( $text );
405 $tagData = self::getRecognizedTagData( $extraTags, $removeTags );
408 $serializer =
new RemexSerializer( $formatter );
409 $treeBuilder =
new RemexTreeBuilder( $serializer, [
410 'ignoreErrors' =>
true,
411 'ignoreNulls' =>
true,
413 $dispatcher =
new RemexDispatcher( $treeBuilder );
414 $tokenHandler = $dispatcher;
416 $tokenHandler, $text, $tagData,
417 $attrCallback, $attrCallbackArgs
419 $tokenizer =
new RemexTokenizer( $remover, $text, [
420 'ignoreErrors' =>
true,
422 'ignoreNulls' =>
true,
423 'skipPreprocess' =>
true,
425 $tokenizer->execute( [
426 'fragmentNamespace' => HTMLData::NS_HTML,
427 'fragmentName' =>
'body',
429 return $serializer->getResult();
442 while ( ( $start = strpos( $text,
'<!--' ) ) !==
false ) {
443 $end = strpos( $text,
'-->', $start + 4 );
444 if ( $end ===
false ) {
445 # Unterminated comment; bail out
451 # Trim space and newline if the comment is both
452 # preceded and followed by a newline
453 $spaceStart = max( $start - 1, 0 );
454 $spaceLen = $end - $spaceStart;
455 while ( substr( $text, $spaceStart, 1 ) ===
' ' && $spaceStart > 0 ) {
459 while ( substr( $text, $spaceStart + $spaceLen, 1 ) ===
' ' ) {
462 if ( substr( $text, $spaceStart, 1 ) ===
"\n"
463 && substr( $text, $spaceStart + $spaceLen, 1 ) ===
"\n" ) {
464 # Remove the comment, leading and trailing
465 # spaces, and leave only one newline.
466 $text = substr_replace( $text,
"\n", $spaceStart, $spaceLen + 1 );
468 # Remove just the comment.
469 $text = substr_replace( $text,
'', $start, $end - $start );
489 private static function validateTag( $params, $element ) {
490 $params = self::decodeTagAttributes( $params );
492 if ( $element ==
'meta' || $element ==
'link' ) {
493 if ( !isset( $params[
'itemprop'] ) ) {
497 if ( $element ==
'meta' && !isset( $params[
'content'] ) ) {
501 if ( $element ==
'link' && !isset( $params[
'href'] ) ) {
526 return self::validateAttributes( $attribs,
527 self::attributesAllowedInternal( $element ) );
549 if ( isset( $allowed[0] ) ) {
552 wfDeprecated( __METHOD__ .
' with sequential array',
'1.35' );
553 $allowed = array_fill_keys( $allowed,
true );
558 foreach ( $attribs as $attribute => $value ) {
559 # Allow XML namespace declaration to allow RDFa
560 if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
561 if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
562 $out[$attribute] = $value;
568 # Allow any attribute beginning with "data-"
570 # * Disallow data attributes used by MediaWiki code
571 # * Ensure that the attribute is not namespaced by banning
574 !preg_match(
'/^data-[^:]*$/i', $attribute ) &&
575 !array_key_exists( $attribute, $allowed )
576 ) || self::isReservedDataAttribute( $attribute ) ) {
580 # Strip javascript "expression" from stylesheets.
582 if ( $attribute ==
'style' ) {
583 $value = self::checkCss( $value );
586 # Escape HTML id attributes
587 if ( $attribute ===
'id' ) {
588 $value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
591 # Escape HTML id reference lists
592 if ( $attribute ===
'aria-describedby'
593 || $attribute ===
'aria-flowto'
594 || $attribute ===
'aria-labelledby'
595 || $attribute ===
'aria-owns'
597 $value = self::escapeIdReferenceListInternal( $value );
601 if ( $attribute ===
'rel' || $attribute ===
'rev'
603 || $attribute ===
'about' || $attribute ===
'property'
604 || $attribute ===
'resource' || $attribute ===
'datatype'
605 || $attribute ===
'typeof'
607 || $attribute ===
'itemid' || $attribute ===
'itemprop'
608 || $attribute ===
'itemref' || $attribute ===
'itemscope'
609 || $attribute ===
'itemtype'
612 if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
617 # NOTE: even though elements using href/src are not allowed directly, supply
618 # validation code that can be used by tag hook handlers, etc
619 if ( $attribute ===
'href' || $attribute ===
'src' || $attribute ===
'poster' ) {
620 if ( !preg_match( $hrefExp, $value ) ) {
626 if ( $attribute ===
'tabindex' && $value !==
'0' ) {
633 $out[$attribute] = $value;
636 # itemtype, itemid, itemref don't make sense without itemscope
637 if ( !array_key_exists(
'itemscope', $out ) ) {
638 unset( $out[
'itemtype'] );
639 unset( $out[
'itemid'] );
640 unset( $out[
'itemref'] );
642 # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
662 return (
bool)preg_match(
'/^data-(ooui|mw|parsoid)/i', $attr );
676 $out = array_merge( $a, $b );
677 if ( isset( $a[
'class'] ) && isset( $b[
'class'] )
678 && is_string( $a[
'class'] ) && is_string( $b[
'class'] )
679 && $a[
'class'] !== $b[
'class']
681 $classes = preg_split(
'/\s+/',
"{$a['class']} {$b['class']}",
682 -1, PREG_SPLIT_NO_EMPTY );
683 $out[
'class'] = implode(
' ', array_unique( $classes ) );
698 $value = self::decodeCharReferences( $value );
710 if ( !$decodeRegex ) {
711 $space =
'[\\x20\\t\\r\\n\\f]';
712 $nl =
'(?:\\n|\\r\\n|\\r|\\f)';
714 $decodeRegex =
"/ $backslash
716 ($nl) | # 1. Line continuation
717 ([0-9A-Fa-f]{1,6})$space? | # 2. character number
718 (.) | # 3. backslash cancelling special meaning
719 () | # 4. backslash at end of string
722 $value = preg_replace_callback( $decodeRegex,
723 [ __CLASS__,
'cssDecodeCallback' ], $value );
728 if ( !preg_match(
'! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
739 $commentPos = strpos( $value,
'/*' );
740 if ( $commentPos !==
false ) {
741 $value = substr( $value, 0, $commentPos );
767 $value = self::normalizeCss( $value );
770 if ( preg_match(
'/[\000-\010\013\016-\037\177]/', $value ) ||
771 strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !==
false ) {
772 return '/* invalid control char */';
773 } elseif ( preg_match(
783 | attr\s*\([^)]+[\s,]+url
785 return '/* insecure input */';
794 private static function cssDecodeCallback(
$matches ) {
799 $char = UtfNormal\Utils::codepointToUtf8( hexdec(
$matches[2] ) );
805 if ( $char ==
"\n" || $char ==
'"' || $char ==
"'" || $char ==
'\\' ) {
808 return '\\' . dechex( ord( $char ) ) .
' ';
837 if ( trim( $text ) ==
'' ) {
841 $decoded = self::decodeTagAttributes( $text );
842 $stripped = self::validateTagAttributes( $decoded, $element );
848 return self::safeEncodeTagAttributes( $stripped );
857 $encValue = htmlspecialchars( $text, ENT_QUOTES );
862 $encValue = strtr( $encValue, [
883 # French spaces, last one Guillemet-left
884 # only if it isn't followed by a word character.
885 '/ (?=[?:;!%»›](?!\w))/u' =>
"$space",
886 # French spaces, Guillemet-right
887 '/([«‹]) /u' =>
"\\1$space",
889 return preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
899 $encValue = self::encodeAttribute( $text );
901 # Templates and links may be expanded in later parsing,
902 # creating invalid or dangerous output. Suppress this.
903 $encValue = strtr( $encValue, [
911 "''" =>
'''',
912 'ISBN' =>
'ISBN',
914 'PMID' =>
'PMID',
920 $encValue = preg_replace_callback(
923 return str_replace(
':',
':',
$matches[1] );
948 if ( $mode === self::ID_PRIMARY ) {
949 throw new UnexpectedValueException(
'$wgFragmentMode is configured with no primary mode' );
956 return self::escapeIdInternal( $id, $internalMode );
975 throw new UnexpectedValueException(
'$wgFragmentMode is configured with no primary mode' );
980 $id = self::escapeIdInternalUrl( $id, $mode );
1011 private static function escapeIdInternalUrl( $id, $mode ) {
1012 $id = self::escapeIdInternal( $id, $mode );
1013 if ( $mode ===
'html5' ) {
1014 $id = preg_replace(
'/%([a-fA-F0-9]{2})/',
'%25$1', $id );
1026 private static function escapeIdInternal( $id, $mode ) {
1029 $id = mb_substr( $id, 0, 1024 );
1037 $id = str_replace( [
"\t",
"\n",
"\f",
"\r",
" " ],
'_', $id );
1046 $id = urlencode( str_replace(
' ',
'_', $id ) );
1047 $id = strtr( $id, $replace );
1050 throw new InvalidArgumentException(
"Invalid mode '$mode' passed to '" . __METHOD__ );
1068 return self::escapeIdReferenceListInternal( $referenceString );
1078 private static function escapeIdReferenceListInternal( $referenceString ) {
1079 # Explode the space delimited list string into an array of tokens
1080 $references = preg_split(
'/\s+/',
"{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );
1082 # Escape each token as an id
1083 foreach ( $references as &$ref ) {
1084 $ref = self::escapeIdForAttribute( $ref );
1087 # Merge the array back to a space delimited list string
1088 # If the array is empty, the result will be an empty string ('')
1089 $referenceString = implode(
' ', $references );
1091 return $referenceString;
1107 return rtrim( preg_replace(
1108 [
'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
'/_+/' ],
1121 $html = self::decodeCharReferences( $html );
1122 # It seems wise to escape ' as well as ", as a matter of course. Can't
1123 # hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
1124 # don't cause the entire string to disappear.
1125 $html = htmlspecialchars( $html, ENT_QUOTES | ENT_SUBSTITUTE );
1137 public static function decodeTagAttributes( $text ) {
1138 if ( trim( $text ) == '' ) {
1143 if ( !preg_match_all(
1144 self::getAttribsRegex(),
1147 PREG_SET_ORDER ) ) {
1152 foreach ( $pairs as $set ) {
1153 $attribute = strtolower( $set[1] );
1155 // Filter attribute names with unacceptable characters
1156 if ( !preg_match( self::getAttribNameRegex(), $attribute ) ) {
1160 $value = self::getTagAttributeCallback( $set );
1162 // Normalize whitespace
1163 $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
1164 $value = trim( $value );
1166 // Decode character references
1167 $attribs[$attribute] = self::decodeCharReferences( $value );
1179 public static function safeEncodeTagAttributes( $assoc_array ) {
1181 foreach ( $assoc_array as $attribute => $value ) {
1182 $encAttribute = htmlspecialchars( $attribute, ENT_COMPAT );
1183 $encValue = self::safeEncodeAttribute( $value );
1185 $attribs[] = "$encAttribute=\"$encValue\"";
1187 return count( $attribs ) ?
' ' . implode(
' ', $attribs ) :
'';
1198 private static function getTagAttributeCallback( $set ) {
1199 if ( isset( $set[5] ) ) {
1202 } elseif ( isset( $set[4] ) ) {
1205 } elseif ( isset( $set[3] ) ) {
1208 } elseif ( !isset( $set[2] ) ) {
1209 # In XHTML, attributes must have a value so return an empty string.
1210 # See "Empty attribute syntax",
1214 throw new MWException(
"Tag conditions not met. This should never happen and is a bug." );
1222 private static function normalizeWhitespace( $text ) {
1223 return trim( preg_replace(
1224 '/(?:\r\n|[\x20\x0d\x0a\x09])+/',
1238 return trim( preg_replace(
'/[ _]+/',
' ', $section ) );
1257 return preg_replace_callback(
1258 self::CHAR_REFS_REGEX,
1259 [ self::class,
'normalizeCharReferencesCallback' ],
1267 private static function normalizeCharReferencesCallback(
$matches ) {
1270 $ret = self::normalizeEntity(
$matches[1] );
1272 $ret = self::decCharReference(
$matches[2] );
1274 $ret = self::hexCharReference(
$matches[3] );
1276 if ( $ret ===
null ) {
1277 return htmlspecialchars(
$matches[0], ENT_COMPAT );
1293 private static function normalizeEntity( $name ) {
1294 if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
1296 return '&' . self::MW_ENTITY_ALIASES[$name];
1297 } elseif ( in_array( $name, [
'lt;',
'gt;',
'amp;',
'quot;' ],
true ) ) {
1300 } elseif ( isset( HTMLData::$namedEntityTranslations[$name] ) ) {
1302 return preg_replace_callback(
'/./Ssu',
static function ( $m ) {
1303 return '&#' . UtfNormal\Utils::utf8ToCodepoint( $m[0] ) .
';';
1304 }, HTMLData::$namedEntityTranslations[$name] );
1306 return "&$name";
1314 private static function decCharReference( $codepoint ) {
1315 $point = intval( $codepoint );
1316 if ( self::validateCodepoint( $point ) ) {
1317 return sprintf(
'&#%d;', $point );
1327 private static function hexCharReference( $codepoint ) {
1328 $point = hexdec( $codepoint );
1329 if ( self::validateCodepoint( $point ) ) {
1330 return sprintf(
'&#x%x;', $point );
1342 private static function validateCodepoint( $codepoint ) {
1343 # U+000C is valid in HTML5 but not allowed in XML.
1344 # U+000D is valid in XML but not allowed in HTML5.
1345 # U+007F - U+009F are disallowed in HTML5 (control characters).
1346 return $codepoint == 0x09
1347 || $codepoint == 0x0a
1348 || ( $codepoint >= 0x20 && $codepoint <= 0x7e )
1349 || ( $codepoint >= 0xa0 && $codepoint <= 0xd7ff )
1350 || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
1351 || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
1362 return preg_replace_callback(
1363 self::CHAR_REFS_REGEX,
1364 [ self::class,
'decodeCharReferencesCallback' ],
1379 $text = preg_replace_callback(
1380 self::CHAR_REFS_REGEX,
1381 [ self::class,
'decodeCharReferencesCallback' ],
1388 return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $text );
1398 private static function decodeCharReferencesCallback(
$matches ) {
1400 return self::decodeEntity(
$matches[1] );
1402 return self::decodeChar( intval(
$matches[2] ) );
1404 return self::decodeChar( hexdec(
$matches[3] ) );
1406 # Last case should be an ampersand by itself
1417 private static function decodeChar( $codepoint ) {
1418 if ( self::validateCodepoint( $codepoint ) ) {
1419 return UtfNormal\Utils::codepointToUtf8( $codepoint );
1421 return UtfNormal\Constants::UTF8_REPLACEMENT;
1433 private static function decodeEntity( $name ) {
1435 if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
1436 $name = self::MW_ENTITY_ALIASES[$name];
1438 $trans = HTMLData::$namedEntityTranslations[$name] ??
null;
1439 return $trans ??
"&$name";
1449 private static function attributesAllowedInternal( $element ) {
1450 $list = self::setupAttributesAllowedInternal();
1451 return $list[$element] ?? [];
1461 private static function setupAttributesAllowedInternal() {
1464 if ( $allowed !==
null ) {
1470 $merge =
static function ( $a, $b, $c = [] ) {
1473 array_fill_keys( $b,
true ),
1474 array_fill_keys( $c,
true ) );
1476 $common = $merge( [], [
1496 # These attributes are specified in section 9 of
1504 # Microdata. These are specified by
1513 $block = $merge( $common, [
'align' ] );
1515 $tablealign = [
'align',
'valign' ];
1523 'nowrap', # deprecated
1524 'width', # deprecated
1525 'height', # deprecated
1526 'bgcolor', # deprecated
1529 # Numbers refer to sections in HTML 4.01 standard describing the element.
1534 'center' => $common, # deprecated
1553 'strong' => $common,
1564 'blockquote' => $merge( $common, [
'cite' ] ),
1565 'q' => $merge( $common, [
'cite' ] ),
1575 'br' => $merge( $common, [
'clear' ] ),
1581 'pre' => $merge( $common, [
'width' ] ),
1584 'ins' => $merge( $common, [
'cite',
'datetime' ] ),
1585 'del' => $merge( $common, [
'cite',
'datetime' ] ),
1588 'ul' => $merge( $common, [
'type' ] ),
1589 'ol' => $merge( $common, [
'type',
'start',
'reversed' ] ),
1590 'li' => $merge( $common, [
'type',
'value' ] ),
1598 'table' => $merge( $common,
1599 [
'summary',
'width',
'border',
'frame',
1600 'rules',
'cellspacing',
'cellpadding',
1605 'caption' => $block,
1613 'colgroup' => $merge( $common, [
'span' ] ),
1614 'col' => $merge( $common, [
'span' ] ),
1617 'tr' => $merge( $common, [
'bgcolor' ], $tablealign ),
1620 'td' => $merge( $common, $tablecell, $tablealign ),
1621 'th' => $merge( $common, $tablecell, $tablealign ),
1624 # NOTE: <a> is not allowed directly, but this list of allowed
1625 # attributes is used from the Parser object
1626 'a' => $merge( $common, [
'href',
'rel',
'rev' ] ), # rel/rev esp.
for RDFa
1629 # Not usually allowed, but may be used for extension-style hooks
1630 # such as <math> when it is rasterized, or if $wgAllowImageTag is
1632 'img' => $merge( $common, [
'alt',
'src',
'width',
'height',
'srcset' ] ),
1633 # Attributes for A/V tags added in T163583 / T133673
1634 'audio' => $merge( $common, [
'controls',
'preload',
'width',
'height' ] ),
1635 'video' => $merge( $common, [
'poster',
'controls',
'preload',
'width',
'height' ] ),
1636 'source' => $merge( $common, [
'type',
'src' ] ),
1637 'track' => $merge( $common, [
'type',
'src',
'srclang',
'kind',
'label' ] ),
1645 'strike' => $common,
1650 'font' => $merge( $common, [
'size',
'color',
'face' ] ),
1654 'hr' => $merge( $common, [
'width' ] ),
1656 # HTML Ruby annotation text module, simple ruby only.
1662 'rt' => $common, # $merge( $common, [
'rbspan' ] ),
1665 # MathML root element, where used for extensions
1666 # 'title' may not be 100% valid here; it's XHTML
1668 'math' => $merge( [], [
'class',
'style',
'id',
'title' ] ),
1671 'figure' => $common,
1672 'figcaption' => $common,
1674 # HTML 5 section 4.6
1677 # HTML5 elements, defined by:
1679 'data' => $merge( $common, [
'value' ] ),
1680 'time' => $merge( $common, [
'datetime' ] ),
1688 'meta' => $merge( [], [
'itemprop',
'content' ] ),
1689 'link' => $merge( [], [
'itemprop',
'href',
'title' ] ),
1691 # HTML 5 section 4.3.5
1712 $tokenizer =
new RemexTokenizer( $handler, $html, [
1713 'ignoreErrors' =>
true,
1715 'ignoreNulls' =>
true,
1716 'skipPreprocess' =>
true,
1718 $tokenizer->execute();
1719 $text = $handler->getResult();
1721 $text = self::normalizeWhitespace( $text );
1737 $out =
"<!DOCTYPE html [\n";
1738 foreach ( HTMLData::$namedEntityTranslations as $entity => $translation ) {
1739 if ( substr( $entity, -1 ) !==
';' ) {
1744 $name = substr( $entity, 0, -1 );
1745 $expansion = self::normalizeEntity( $entity );
1746 if ( $entity === $expansion ) {
1750 $out .=
"<!ENTITY $name \"$expansion\">";
1761 # Normalize any HTML entities in input. They will be
1762 # re-escaped by makeExternalLink().
1763 $url = self::decodeCharReferences( $url );
1765 # Escape any control characters introduced by the above step
1766 $url = preg_replace_callback(
'/[\][<>"\\x00-\\x20\\x7F\|]/',
1767 [ __CLASS__,
'cleanUrlCallback' ], $url );
1769 # Validate hostname portion
1771 if ( preg_match(
'!^([^:]+:)(//[^/]+)?(.*)$!iD', $url,
$matches ) ) {
1772 list( , $protocol, $host, $rest ) =
$matches;
1779 \\s| # general whitespace
1780 \u{00AD}| # SOFT HYPHEN
1781 \u{034F}| # COMBINING GRAPHEME JOINER
1782 \u{061C}| # ARABIC LETTER MARK
1783 [\u{115F}-\u{1160}]| # HANGUL CHOSEONG FILLER..
1784 # HANGUL JUNGSEONG FILLER
1785 [\u{17B4}-\u{17B5}]| # KHMER VOWEL INHERENT AQ..
1786 # KHMER VOWEL INHERENT AA
1787 [\u{180B}-\u{180D}]| # MONGOLIAN FREE VARIATION SELECTOR ONE..
1788 # MONGOLIAN FREE VARIATION SELECTOR THREE
1789 \u{180E}| # MONGOLIAN VOWEL SEPARATOR
1790 [\u{200B}-\u{200F}]| # ZERO WIDTH SPACE..
1791 # RIGHT-TO-LEFT MARK
1792 [\u{202A}-\u{202E}]| # LEFT-TO-RIGHT EMBEDDING..
1793 # RIGHT-TO-LEFT OVERRIDE
1794 [\u{2060}-\u{2064}]| # WORD JOINER..
1796 \u{2065}| # <reserved-2065>
1797 [\u{2066}-\u{206F}]| # LEFT-TO-RIGHT ISOLATE..
1798 # NOMINAL DIGIT SHAPES
1799 \u{3164}| # HANGUL FILLER
1800 [\u{FE00}-\u{FE0F}]| # VARIATION SELECTOR-1..
1801 # VARIATION SELECTOR-16
1802 \u{FEFF}| # ZERO WIDTH NO-BREAK SPACE
1803 \u{FFA0}| # HALFWIDTH HANGUL FILLER
1804 [\u{FFF0}-\u{FFF8}]| # <reserved-FFF0>..
1806 [\u{1BCA0}-\u{1BCA3}]| # SHORTHAND FORMAT LETTER OVERLAP..
1807 # SHORTHAND FORMAT UP STEP
1808 [\u{1D173}-\u{1D17A}]| # MUSICAL SYMBOL BEGIN BEAM..
1809 # MUSICAL SYMBOL END PHRASE
1810 \u{E0000}| # <reserved-E0000>
1811 \u{E0001}| # LANGUAGE TAG
1812 [\u{E0002}-\u{E001F}]| # <reserved-E0002>..
1814 [\u{E0020}-\u{E007F}]| # TAG SPACE..
1816 [\u{E0080}-\u{E00FF}]| # <reserved-E0080>..
1818 [\u{E0100}-\u{E01EF}]| # VARIATION SELECTOR-17..
1819 # VARIATION SELECTOR-256
1820 [\u{E01F0}-\u{E0FFF}]| # <reserved-E01F0>..
1824 $host = preg_replace( $strip,
'', $host );
1827 if ( str_starts_with( $host,
"//%5B" ) &&
1828 preg_match(
'!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host,
$matches )
1835 return $protocol . $host . $rest;
1845 private static function cleanUrlCallback(
$matches ) {
1880 if ( !Hooks::runner()->onIsValidEmailAddr( $addr, $result ) ) {
1887 $rfc5322_atext =
"a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
1888 $rfc1034_ldh_str =
"a-z0-9\\-";
1890 $html5_email_regexp =
"/
1892 [$rfc5322_atext\\.]+ # user part which is liberal :p
1894 [$rfc1034_ldh_str]+ # First domain part
1895 (\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
1899 return (
bool)preg_match( $html5_email_regexp, $addr );
wfDeprecatedMsg( $msg, $version=false, $component=false, $callerOffset=2)
Log a deprecation warning with arbitrary message text.
wfUrlProtocols( $includeProtocolRelative=true)
Returns a regular expression of url protocols.
wfDeprecated( $function, $version=false, $component=false, $callerOffset=2)
Logs a warning that a deprecated feature was used.
HTML sanitizer for MediaWiki.
static fixTagAttributes( $text, $element, $sorted=false)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML,...
static isReservedDataAttribute( $attr)
Given an attribute name, checks whether it is a reserved data attribute (such as data-mw-foo) which i...
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
static checkCss( $value)
Pick apart some CSS and check it for forbidden or unsafe structures.
static validateAttributes( $attribs, $allowed)
Take an array of attribute names and values and normalize or discard illegal values.
static armorFrenchSpaces( $text, $space=' ')
Armor French spaces with a replacement character.
static getRecognizedTagData( $extratags=[], $removetags=[])
Return the various lists of recognized tags.
static escapeIdForLink( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
static removeHTMLcomments( $text)
Remove '<!--', '-->', and everything between.
static encodeAttribute( $text)
Encode an attribute value for HTML output.
static validateTagAttributes( $attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
static escapeClass( $class)
Given a value, escape it so that it can be used as a CSS class and return it.
static removeSomeTags(string $text, array $options=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; the result will alw...
static internalRemoveHtmlTags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; BEWARE there may be...
static normalizeSectionNameWhitespace( $section)
Normalizes whitespace in a section name, such as might be returned by Parser::stripSectionName(),...
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
static normalizeCharReferences( $text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
static removeHTMLtags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments; BEWARE there may be...
static escapeIdReferenceList( $referenceString)
Given a string containing a space delimited list of ids, escape each id to match ids escaped by the e...
const ID_FALLBACK
Tells escapeIdForAttribute() to encode the ID using the fallback encoding, or return false if no fallback...
static decodeCharReferences( $text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
static escapeIdForAttribute( $id, $mode=self::ID_PRIMARY)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid HTM...
static stripAllTags( $html)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed,...
static decodeCharReferencesAndNormalize( $text)
Decode any character references, numeric or named entities, in the text and normalize the resulting s...
static mergeAttributes( $a, $b)
Merge two sets of HTML attributes.
static escapeIdForExternalInterwiki( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
static validateEmail( $addr)
Does a string look like an e-mail address?
const ID_PRIMARY
Tells escapeIdForAttribute() to encode the ID using the wiki's primary encoding.
static safeEncodeAttribute( $text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing.
static normalizeCss( $value)
Normalize CSS into a format we can easily search for hostile input.
static delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags='')
Perform an operation equivalent to preg_replace() with flags.
$wgAllowImageTag
Config variable stub for the AllowImageTag setting, for use by phpdoc and IDEs.
$wgFragmentMode
Config variable stub for the FragmentMode setting, for use by phpdoc and IDEs.
$wgExternalInterwikiFragmentMode
Config variable stub for the ExternalInterwikiFragmentMode setting, for use by phpdoc and IDEs.