39 '/&([A-Za-z0-9\x80-\xff]+); 41 |&\#[xX]([0-9A-Fa-f]+); 357 if ( self::$attribsRegex === null ) {
358 $spaceChars =
'\x09\x0a\x0c\x0d\x20';
359 $space =
"[{$spaceChars}]";
360 $attrib =
"[^{$spaceChars}\/>=]";
361 $attribFirst =
"(?:{$attrib}|=)";
362 self::$attribsRegex =
363 "/({$attribFirst}{$attrib}*) 366 # The attribute value: quoted or alone 373 return self::$attribsRegex;
386 if ( self::$attribNameRegex === null ) {
387 $attribFirst =
"[:_\p{L}\p{N}]";
388 $attrib =
"[:_\.\-\p{L}\p{N}]";
389 self::$attribNameRegex =
"/^({$attribFirst}{$attrib}*)$/sxu";
391 return self::$attribNameRegex;
403 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
404 $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
409 if ( !$staticInitialised || $staticInitialised != $globalContext ) {
410 $htmlpairsStatic = [ # Tags that must be closed
411 'b',
'bdi',
'del',
'i',
'ins',
'u',
'font',
'big',
'small',
'sub',
'sup',
'h1',
412 'h2',
'h3',
'h4',
'h5',
'h6',
'cite',
'code',
'em',
's',
413 'strike',
'strong',
'tt',
'var',
'div',
'center',
414 'blockquote',
'ol',
'ul',
'dl',
'table',
'caption',
'pre',
415 'ruby',
'rb',
'rp',
'rt',
'rtc',
'p',
'span',
'abbr',
'dfn',
416 'kbd',
'samp',
'data',
'time',
'mark' 419 'br',
'wbr',
'hr',
'li',
'dt',
'dd',
'meta',
'link' 422 # Elements that cannot have close tags. This is (not coincidentally) 423 # also the list of tags for which the HTML 5 parsing algorithm 424 # requires you to "acknowledge the token's self-closing flag", i.e. 425 # a self-closing tag like <br/> is not an HTML 5 parse error only 428 'br',
'wbr',
'hr',
'meta',
'link' 431 $htmlnest = [ # Tags that can be nested--??
432 'table',
'tr',
'td',
'th',
'div',
'blockquote',
'ol',
'ul',
433 'li',
'dl',
'dt',
'dd',
'font',
'big',
'small',
'sub',
'sup',
'span',
434 'var',
'kbd',
'samp',
'em',
'strong',
'q',
'ruby',
'bdo' 436 $tabletags = [ # Can only appear inside table, we will close them
439 $htmllist = [ # Tags used by list
442 $listtags = [ # Tags that can appear in a list
446 if ( $wgAllowImageTag ) {
447 $htmlsingle[] =
'img';
448 $htmlsingleonly[] =
'img';
451 $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
452 $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
454 # Convert them all to hashtables for faster lookup 455 $vars = [
'htmlpairsStatic',
'htmlsingle',
'htmlsingleonly',
'htmlnest',
'tabletags',
456 'htmllist',
'listtags',
'htmlsingleallowed',
'htmlelementsStatic' ];
457 foreach ( $vars as $var ) {
458 $$var = array_flip( $$var );
460 $staticInitialised = $globalContext;
463 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays 464 $extratags = array_flip( $extratags );
465 $removetags = array_flip( $removetags );
466 $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
467 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
470 'htmlpairs' => $htmlpairs,
471 'htmlsingle' => $htmlsingle,
472 'htmlsingleonly' => $htmlsingleonly,
473 'htmlnest' => $htmlnest,
474 'tabletags' => $tabletags,
475 'htmllist' => $htmllist,
476 'listtags' => $listtags,
477 'htmlsingleallowed' => $htmlsingleallowed,
478 'htmlelements' => $htmlelements,
498 $args = [], $extratags = [], $removetags = [], $warnCallback = null
500 $tagData = self::getRecognizedTagData( $extratags, $removetags );
501 $htmlpairs = $tagData[
'htmlpairs'];
502 $htmlsingle = $tagData[
'htmlsingle'];
503 $htmlsingleonly = $tagData[
'htmlsingleonly'];
504 $htmlnest = $tagData[
'htmlnest'];
505 $tabletags = $tagData[
'tabletags'];
506 $htmllist = $tagData[
'htmllist'];
507 $listtags = $tagData[
'listtags'];
508 $htmlsingleallowed = $tagData[
'htmlsingleallowed'];
509 $htmlelements = $tagData[
'htmlelements'];
511 # Remove HTML comments 512 $text = self::removeHTMLcomments( $text );
513 $bits = explode(
'<', $text );
514 $text = str_replace(
'>',
'>', array_shift( $bits ) );
517 $tagstack = $tablestack = [];
518 foreach ( $bits as $x ) {
520 # $slash: Does the current element start with a '/'? 521 # $t: Current element name 522 # $params: String between element name and > 523 # $brace: Ending '>' or '/>' 524 # $rest: Everything until the next element of $bits 525 if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
526 list( , $slash,
$t, $params, $brace, $rest ) = $regs;
528 $slash =
$t = $params = $brace = $rest = null;
532 $t = strtolower(
$t );
533 if ( isset( $htmlelements[
$t] ) ) {
535 if ( $slash && isset( $htmlsingleonly[$t] ) ) {
537 } elseif ( $slash ) {
538 # Closing a tag... is it the one we just opened? 539 Wikimedia\suppressWarnings();
540 $ot = array_pop( $tagstack );
541 Wikimedia\restoreWarnings();
544 if ( isset( $htmlsingleallowed[$ot] ) ) {
545 # Pop all elements with an optional close tag 546 # and see if we find a match below them 548 array_push( $optstack, $ot );
549 Wikimedia\suppressWarnings();
550 $ot = array_pop( $tagstack );
551 Wikimedia\restoreWarnings();
552 while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
553 array_push( $optstack, $ot );
554 Wikimedia\suppressWarnings();
555 $ot = array_pop( $tagstack );
556 Wikimedia\restoreWarnings();
559 # No match. Push the optional elements back again 561 Wikimedia\suppressWarnings();
562 $ot = array_pop( $optstack );
563 Wikimedia\restoreWarnings();
565 array_push( $tagstack, $ot );
566 Wikimedia\suppressWarnings();
567 $ot = array_pop( $optstack );
568 Wikimedia\restoreWarnings();
572 Wikimedia\suppressWarnings();
573 array_push( $tagstack, $ot );
574 Wikimedia\restoreWarnings();
576 # <li> can be nested in <ul> or <ol>, skip those cases: 577 if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
581 } elseif ( $t ==
'table' ) {
582 $tagstack = array_pop( $tablestack );
586 # Keep track for later 587 if ( isset( $tabletags[$t] ) && !in_array(
'table', $tagstack ) ) {
589 } elseif ( in_array( $t, $tagstack ) && !isset( $htmlnest[$t] ) ) {
591 # Is it a self closed htmlpair ? (T7487) 592 } elseif ( $brace ==
'/>' && isset( $htmlpairs[$t] ) ) {
598 if ( is_callable( $warnCallback ) ) {
599 call_user_func_array( $warnCallback, [
'deprecated-self-close-category' ] );
602 } elseif ( isset( $htmlsingleonly[$t] ) ) {
603 # Hack to force empty tag for unclosable elements 605 } elseif ( isset( $htmlsingle[$t] ) ) {
606 # Hack to not close $htmlsingle tags 608 # Still need to push this optionally-closed tag to 609 # the tag stack so that we can match end tags 610 # instead of marking them as bad. 611 array_push( $tagstack, $t );
612 } elseif ( isset( $tabletags[$t] ) && in_array( $t, $tagstack ) ) {
616 if ( $t ==
'table' ) {
617 array_push( $tablestack, $tagstack );
620 array_push( $tagstack, $t );
623 # Replace any variables or template parameters with 625 if ( is_callable( $processCallback ) ) {
626 call_user_func_array( $processCallback, [ &$params,
$args ] );
629 if ( !self::validateTag( $params, $t ) ) {
633 # Strip non-approved attributes from the tag 634 $newparams = self::fixTagAttributes( $params, $t );
637 $rest = str_replace(
'>',
'>', $rest );
638 $close = ( $brace ==
'/>' && !$slash ) ?
' /' :
'';
639 $text .=
"<$slash$t$newparams$close>$rest";
643 $text .=
'<' . str_replace(
'>',
'>', $x );
645 # Close off any remaining tags 646 while ( is_array( $tagstack ) && (
$t = array_pop( $tagstack ) ) ) {
648 if (
$t ==
'table' ) {
649 $tagstack = array_pop( $tablestack );
653 # this might be possible using tidy itself 654 foreach ( $bits as $x ) {
655 if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
656 list( , $slash,
$t, $params, $brace, $rest ) = $regs;
659 $t = strtolower(
$t );
660 if ( isset( $htmlelements[
$t] ) ) {
661 if ( is_callable( $processCallback ) ) {
662 call_user_func_array( $processCallback, [ &$params,
$args ] );
665 if ( $brace ==
'/>' && !( isset( $htmlsingle[$t] ) || isset( $htmlsingleonly[$t] ) ) ) {
671 if ( is_callable( $warnCallback ) ) {
672 call_user_func_array( $warnCallback, [
'deprecated-self-close-category' ] );
675 if ( !self::validateTag( $params, $t ) ) {
679 $newparams = self::fixTagAttributes( $params, $t );
681 if ( $brace ===
'/>' && !isset( $htmlsingleonly[$t] ) ) {
682 # Interpret self-closing tags as empty tags even when 683 # HTML 5 would interpret them as start tags. Such input 684 # is commonly seen on Wikimedia wikis with this intention. 688 $rest = str_replace(
'>',
'>', $rest );
689 $text .=
"<$slash$t$newparams$brace$rest";
694 $text .=
'<' . str_replace(
'>',
'>', $x );
710 while ( ( $start = strpos( $text,
'<!--' ) ) !==
false ) {
711 $end = strpos( $text,
'-->', $start + 4 );
712 if ( $end ===
false ) {
713 # Unterminated comment; bail out 719 # Trim space and newline if the comment is both 720 # preceded and followed by a newline 721 $spaceStart = max( $start - 1, 0 );
722 $spaceLen = $end - $spaceStart;
723 while ( substr( $text, $spaceStart, 1 ) ===
' ' && $spaceStart > 0 ) {
727 while ( substr( $text, $spaceStart + $spaceLen, 1 ) ===
' ' ) {
730 if ( substr( $text, $spaceStart, 1 ) ===
"\n" 731 && substr( $text, $spaceStart + $spaceLen, 1 ) ===
"\n" ) {
732 # Remove the comment, leading and trailing 733 # spaces, and leave only one newline. 734 $text = substr_replace( $text,
"\n", $spaceStart, $spaceLen + 1 );
736 # Remove just the comment. 737 $text = substr_replace( $text,
'', $start, $end - $start );
756 $params = self::decodeTagAttributes( $params );
758 if ( $element ==
'meta' || $element ==
'link' ) {
759 if ( !isset( $params[
'itemprop'] ) ) {
763 if ( $element ==
'meta' && !isset( $params[
'content'] ) ) {
767 if ( $element ==
'link' && !isset( $params[
'href'] ) ) {
792 return self::validateAttributes( $attribs,
793 self::attributeWhitelistInternal( $element ) );
814 if ( isset( $whitelist[0] ) ) {
818 $whitelist = array_flip( $whitelist );
823 foreach ( $attribs as $attribute => $value ) {
824 # Allow XML namespace declaration to allow RDFa 825 if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
826 if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
827 $out[$attribute] = $value;
833 # Allow any attribute beginning with "data-" 835 # * Disallow data attributes used by MediaWiki code 836 # * Ensure that the attribute is not namespaced by banning 839 !preg_match(
'/^data-[^:]*$/i', $attribute ) &&
840 !array_key_exists( $attribute, $whitelist )
841 ) || self::isReservedDataAttribute( $attribute ) ) {
845 # Strip javascript "expression" from stylesheets. 846 # https://msdn.microsoft.com/en-us/library/ms537634.aspx 847 if ( $attribute ==
'style' ) {
848 $value = self::checkCss( $value );
851 # Escape HTML id attributes 852 if ( $attribute ===
'id' ) {
853 $value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
856 # Escape HTML id reference lists 857 if ( $attribute ===
'aria-describedby' 858 || $attribute ===
'aria-flowto' 859 || $attribute ===
'aria-labelledby' 860 || $attribute ===
'aria-owns' 862 $value = self::escapeIdReferenceList( $value );
867 if ( $attribute ===
'rel' || $attribute ===
'rev' 869 || $attribute ===
'about' || $attribute ===
'property' 870 || $attribute ===
'resource' || $attribute ===
'datatype' 871 || $attribute ===
'typeof' 873 || $attribute ===
'itemid' || $attribute ===
'itemprop' 874 || $attribute ===
'itemref' || $attribute ===
'itemscope' 875 || $attribute ===
'itemtype' 878 if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
883 # NOTE: even though elements using href/src are not allowed directly, supply 884 # validation code that can be used by tag hook handlers, etc 885 if ( $attribute ===
'href' || $attribute ===
'src' || $attribute ===
'poster' ) {
886 if ( !preg_match( $hrefExp, $value ) ) {
894 $out[$attribute] = $value;
897 # itemtype, itemid, itemref don't make sense without itemscope 898 if ( !array_key_exists(
'itemscope', $out ) ) {
899 unset( $out[
'itemtype'] );
900 unset( $out[
'itemid'] );
901 unset( $out[
'itemref'] );
903 # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref. 923 return (
bool)preg_match(
'/^data-(ooui|mw|parsoid)/i', $attr );
937 $out = array_merge( $a, $b );
938 if ( isset( $a[
'class'] ) && isset( $b[
'class'] )
939 && is_string( $a[
'class'] ) && is_string( $b[
'class'] )
940 && $a[
'class'] !== $b[
'class']
942 $classes = preg_split(
'/\s+/',
"{$a['class']} {$b['class']}",
943 -1, PREG_SPLIT_NO_EMPTY );
944 $out[
'class'] = implode(
' ', array_unique( $classes ) );
959 $value = self::decodeCharReferences( $value );
971 if ( !$decodeRegex ) {
972 $space =
'[\\x20\\t\\r\\n\\f]';
973 $nl =
'(?:\\n|\\r\\n|\\r|\\f)';
975 $decodeRegex =
"/ $backslash 977 ($nl) | # 1. Line continuation 978 ([0-9A-Fa-f]{1,6})$space? | # 2. character number 979 (.) | # 3. backslash cancelling special meaning 980 () | # 4. backslash at end of string 983 $value = preg_replace_callback( $decodeRegex,
984 [ __CLASS__,
'cssDecodeCallback' ], $value );
989 if ( !preg_match(
'! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
1000 $commentPos = strpos( $value,
'/*' );
1001 if ( $commentPos !==
false ) {
1002 $value = substr( $value, 0, $commentPos );
1028 $value = self::normalizeCss( $value );
1031 if ( preg_match(
'/[\000-\010\013\016-\037\177]/', $value ) ||
1032 strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !==
false ) {
1033 return '/* invalid control char */';
1034 } elseif ( preg_match(
1039 | -o-link-source\s*: 1044 | attr\s*\([^)]+[\s,]+url 1047 return '/* insecure input */';
1061 $char = UtfNormal\Utils::codepointToUtf8( hexdec(
$matches[2] ) );
1067 if ( $char ==
"\n" || $char ==
'"' || $char ==
"'" || $char ==
'\\' ) {
1070 return '\\' . dechex( ord( $char ) ) .
' ';
1099 if ( trim( $text ) ==
'' ) {
1103 $decoded = self::decodeTagAttributes( $text );
1104 $stripped = self::validateTagAttributes( $decoded, $element );
1110 return self::safeEncodeTagAttributes( $stripped );
1119 $encValue = htmlspecialchars( $text, ENT_QUOTES );
1124 $encValue = strtr( $encValue, [
1145 # French spaces, last one Guillemet-left 1146 # only if there is something before the space 1147 # and a non-word character after the punctuation. 1148 '/(?<=\S) (?=[?:;!%»›](?!\w))/u' =>
"$space",
1149 # French spaces, Guillemet-right 1150 '/([«‹]) /u' =>
"\\1$space",
1152 return preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
1162 $encValue = self::encodeAttribute( $text );
1164 # Templates and links may be expanded in later parsing, 1165 # creating invalid or dangerous output. Suppress this. 1166 $encValue = strtr( $encValue, [
1174 "''" =>
'''',
1175 'ISBN' =>
'ISBN',
1177 'PMID' =>
'PMID',
1182 # Armor against French spaces detection (T5158) 1183 $encValue = self::armorFrenchSpaces( $encValue,
' ' );
1186 $encValue = preg_replace_callback(
1189 return str_replace(
':',
':',
$matches[1] );
1224 $options = (array)$options;
1232 $id = urlencode( strtr( $id,
' ',
'_' ) );
1233 $id = strtr( $id, $replace );
1235 if ( !preg_match(
'/^[a-zA-Z]/', $id ) && !in_array(
'noninitial', $options ) ) {
1260 if ( !isset( $wgFragmentMode[$mode] ) ) {
1261 if ( $mode === self::ID_PRIMARY ) {
1267 $internalMode = $wgFragmentMode[$mode];
1269 return self::escapeIdInternal( $id, $internalMode );
1287 if ( !isset( $wgFragmentMode[self::ID_PRIMARY] ) ) {
1291 $mode = $wgFragmentMode[self::ID_PRIMARY];
1293 $id = self::escapeIdInternal( $id, $mode );
1310 $id = self::escapeIdInternal( $id, $wgExternalInterwikiFragmentMode );
1325 $id = str_replace(
' ',
'_', $id );
1334 $id = urlencode( str_replace(
' ',
'_', $id ) );
1335 $id = strtr( $id, $replace );
1354 # Explode the space delimited list string into an array of tokens 1355 $references = preg_split(
'/\s+/',
"{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );
1357 # Escape each token as an id 1358 foreach ( $references as &$ref ) {
1359 $ref = self::escapeIdForAttribute( $ref );
1362 # Merge the array back to a space delimited list string 1363 # If the array is empty, the result will be an empty string ('') 1364 $referenceString = implode(
' ', $references );
1366 return $referenceString;
1382 return rtrim( preg_replace(
1383 [
'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
'/_+/' ],
1396 $html = self::decodeCharReferences( $html );
1397 # It seems wise to escape ' as well as ", as a matter of course. Can't 1398 # hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters 1399 # don't cause the entire string to disappear. 1400 $html = htmlspecialchars( $html, ENT_QUOTES | ENT_SUBSTITUTE );
1413 if ( trim( $text ) ==
'' ) {
1418 if ( !preg_match_all(
1419 self::getAttribsRegex(),
1422 PREG_SET_ORDER ) ) {
1427 foreach ( $pairs as $set ) {
1428 $attribute = strtolower( $set[1] );
1431 if ( !preg_match( self::getAttribNameRegex(), $attribute ) ) {
1435 $value = self::getTagAttributeCallback( $set );
1438 $value = preg_replace(
'/[\t\r\n ]+/',
' ', $value );
1439 $value = trim( $value );
1442 $attribs[$attribute] = self::decodeCharReferences( $value );
1456 foreach ( $assoc_array as $attribute => $value ) {
1457 $encAttribute = htmlspecialchars( $attribute );
1458 $encValue = self::safeEncodeAttribute( $value );
1460 $attribs[] =
"$encAttribute=\"$encValue\"";
1462 return count( $attribs ) ?
' ' . implode(
' ', $attribs ) :
'';
1474 if ( isset( $set[5] ) ) {
1477 } elseif ( isset( $set[4] ) ) {
1480 } elseif ( isset( $set[3] ) ) {
1483 } elseif ( !isset( $set[2] ) ) {
1484 # In XHTML, attributes must have a value so return an empty string. 1485 # See "Empty attribute syntax", 1486 # https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name 1489 throw new MWException(
"Tag conditions not met. This should never happen and is a bug." );
1498 return trim( preg_replace(
1499 '/(?:\r\n|[\x20\x0d\x0a\x09])+/',
1513 return trim( preg_replace(
'/[ _]+/',
' ', $section ) );
1532 return preg_replace_callback(
1533 self::CHAR_REFS_REGEX,
1534 [ self::class,
'normalizeCharReferencesCallback' ],
1545 $ret = self::normalizeEntity(
$matches[1] );
1547 $ret = self::decCharReference(
$matches[2] );
1549 $ret = self::hexCharReference(
$matches[3] );
1551 if ( is_null( $ret ) ) {
1552 return htmlspecialchars(
$matches[0] );
1569 if ( isset( self::HTML_ENTITY_ALIASES[$name] ) ) {
1570 return '&' . self::HTML_ENTITY_ALIASES[$name] .
';';
1571 } elseif ( in_array( $name, [
'lt',
'gt',
'amp',
'quot' ] ) ) {
1573 } elseif ( isset( self::HTML_ENTITIES[$name] ) ) {
1574 return '&#' . self::HTML_ENTITIES[$name] .
';';
1576 return "&$name;";
1585 $point = intval( $codepoint );
1586 if ( self::validateCodepoint( $point ) ) {
1587 return sprintf(
'&#%d;', $point );
1598 $point = hexdec( $codepoint );
1599 if ( self::validateCodepoint( $point ) ) {
1600 return sprintf(
'&#x%x;', $point );
1613 # U+000C is valid in HTML5 but not allowed in XML. 1614 # U+000D is valid in XML but not allowed in HTML5. 1615 # U+007F - U+009F are disallowed in HTML5 (control characters). 1616 return $codepoint == 0x09
1617 || $codepoint == 0x0a
1618 || ( $codepoint >= 0x20 && $codepoint <= 0x7e )
1619 || ( $codepoint >= 0xa0 && $codepoint <= 0xd7ff )
1620 || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
1621 || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
1632 return preg_replace_callback(
1633 self::CHAR_REFS_REGEX,
1634 [ self::class,
'decodeCharReferencesCallback' ],
1649 $text = preg_replace_callback(
1650 self::CHAR_REFS_REGEX,
1651 [ self::class,
'decodeCharReferencesCallback' ],
1658 return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $text );
1670 return self::decodeEntity(
$matches[1] );
1672 return self::decodeChar( intval(
$matches[2] ) );
1674 return self::decodeChar( hexdec(
$matches[3] ) );
1676 # Last case should be an ampersand by itself 1688 if ( self::validateCodepoint( $codepoint ) ) {
1689 return UtfNormal\Utils::codepointToUtf8( $codepoint );
1691 return UtfNormal\Constants::UTF8_REPLACEMENT;
1704 if ( isset( self::HTML_ENTITY_ALIASES[$name] ) ) {
1705 $name = self::HTML_ENTITY_ALIASES[$name];
1707 if ( isset( self::HTML_ENTITIES[$name] ) ) {
1708 return UtfNormal\Utils::codepointToUtf8( self::HTML_ENTITIES[$name] );
1723 $list = self::setupAttributeWhitelist();
1724 return $list[$element] ?? [];
1735 $list = self::setupAttributeWhitelistInternal();
1736 return $list[$element] ?? [];
1747 $wlist = self::setupAttributeWhitelistInternal();
1750 return array_map(
function ( $v ) {
1751 return array_keys( $v );
1765 if ( $whitelist !== null ) {
1771 $merge =
function ( $a, $b, $c = [] ) {
1772 return array_merge( $a, array_flip( $b ), array_flip( $c ) );
1774 $common = $merge( [], [
1792 # These attributes are specified in section 9 of
1800 # Microdata. These are specified by
1809 $block = $merge( $common, [
'align' ] );
1811 $tablealign = [
'align',
'valign' ];
1819 'nowrap', # deprecated
1820 'width', # deprecated
1821 'height', # deprecated
1822 'bgcolor', # deprecated
1825 # Numbers refer to sections in HTML 4.01 standard describing the element. 1826 # See: https://www.w3.org/TR/html4/ 1830 'center' => $common, # deprecated
1849 'strong' => $common,
1860 'blockquote' => $merge( $common, [
'cite' ] ),
1861 'q' => $merge( $common, [
'cite' ] ),
1871 'br' => $merge( $common, [
'clear' ] ),
1873 # https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element 1877 'pre' => $merge( $common, [
'width' ] ),
1880 'ins' => $merge( $common, [
'cite',
'datetime' ] ),
1881 'del' => $merge( $common, [
'cite',
'datetime' ] ),
1884 'ul' => $merge( $common, [
'type' ] ),
1885 'ol' => $merge( $common, [
'type',
'start',
'reversed' ] ),
1886 'li' => $merge( $common, [
'type',
'value' ] ),
1894 'table' => $merge( $common,
1895 [
'summary',
'width',
'border',
'frame',
1896 'rules',
'cellspacing',
'cellpadding',
1901 'caption' => $block,
1909 'colgroup' => $merge( $common, [
'span' ] ),
1910 'col' => $merge( $common, [
'span' ] ),
1913 'tr' => $merge( $common, [
'bgcolor' ], $tablealign ),
1916 'td' => $merge( $common, $tablecell, $tablealign ),
1917 'th' => $merge( $common, $tablecell, $tablealign ),
1920 # NOTE: <a> is not allowed directly, but the attrib 1921 # whitelist is used from the Parser object 1922 'a' => $merge( $common, [
'href',
'rel',
'rev' ] ), # rel/rev esp.
for RDFa
1925 # Not usually allowed, but may be used for extension-style hooks 1926 # such as <math> when it is rasterized, or if $wgAllowImageTag is 1928 'img' => $merge( $common, [
'alt',
'src',
'width',
'height',
'srcset' ] ),
1929 # Attributes for A/V tags added in T163583 / T133673 1930 'audio' => $merge( $common, [
'controls',
'preload',
'width',
'height' ] ),
1931 'video' => $merge( $common, [
'poster',
'controls',
'preload',
'width',
'height' ] ),
1932 'source' => $merge( $common, [
'type',
'src' ] ),
1933 'track' => $merge( $common, [
'type',
'src',
'srclang',
'kind',
'label' ] ),
1941 'strike' => $common,
1946 'font' => $merge( $common, [
'size',
'color',
'face' ] ),
1950 'hr' => $merge( $common, [
'width' ] ),
1952 # HTML Ruby annotation text module, simple ruby only. 1953 # https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element 1958 'rt' => $common, # $merge( $common, [
'rbspan' ] ),
1961 # MathML root element, where used for extensions 1962 # 'title' may not be 100% valid here; it's XHTML 1963 # https://www.w3.org/TR/REC-MathML/ 1964 'math' => $merge( [], [
'class',
'style',
'id',
'title' ] ),
1967 'figure' => $common,
1968 'figure-inline' => $common, # T118520
1969 'figcaption' => $common,
1971 # HTML 5 section 4.6 1974 # HTML5 elements, defined by: 1975 # https://html.spec.whatwg.org/multipage/semantics.html#the-data-element 1976 'data' => $merge( $common, [
'value' ] ),
1977 'time' => $merge( $common, [
'datetime' ] ),
1985 'meta' => $merge( [], [
'itemprop',
'content' ] ),
1986 'link' => $merge( [], [
'itemprop',
'href',
'title' ] ),
2007 'ignoreErrors' =>
true,
2009 'ignoreNulls' =>
true,
2010 'skipPreprocess' =>
true,
2012 $tokenizer->execute();
2013 $text = $handler->getResult();
2015 $text = self::normalizeWhitespace( $text );
2029 $out =
"<!DOCTYPE html [\n";
2030 foreach ( self::HTML_ENTITIES as $entity => $codepoint ) {
2031 $out .=
"<!ENTITY $entity \"&#$codepoint;\">";
2042 # Normalize any HTML entities in input. They will be 2043 # re-escaped by makeExternalLink(). 2044 $url = self::decodeCharReferences( $url );
2046 # Escape any control characters introduced by the above step 2047 $url = preg_replace_callback(
'/[\][<>"\\x00-\\x20\\x7F\|]/',
2048 [ __CLASS__,
'cleanUrlCallback' ], $url );
2050 # Validate hostname portion 2052 if ( preg_match(
'!^([^:]+:)(//[^/]+)?(.*)$!iD', $url,
$matches ) ) {
2053 list( , $protocol, $host, $rest ) =
$matches;
2059 \\s| # general whitespace 2060 \xc2\xad| # 00ad SOFT HYPHEN 2061 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN 2062 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE 2063 \xe2\x81\xa0| # 2060 WORD JOINER 2064 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE 2065 \xcd\x8f| # 034f COMBINING GRAPHEME JOINER 2066 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE 2067 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO 2068 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE 2069 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER 2070 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER 2071 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16 2074 $host = preg_replace( $strip,
'', $host );
2077 if ( substr_compare(
"//%5B", $host, 0, 5 ) === 0 &&
2078 preg_match(
'!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host,
$matches )
2085 return $protocol . $host . $rest;
2129 if ( !
Hooks::run(
'isValidEmailAddr', [ $addr, &$result ] ) ) {
2136 $rfc5322_atext =
"a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
2137 $rfc1034_ldh_str =
"a-z0-9\\-";
2139 $html5_email_regexp =
"/ 2141 [$rfc5322_atext\\.]+ # user part which is liberal :p 2143 [$rfc1034_ldh_str]+ # First domain part 2144 (\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot 2148 return (
bool)preg_match( $html5_email_regexp, $addr );
$wgExternalInterwikiFragmentMode
Which ID escaping mode should be used for external interwiki links? See documentation for $wgFragment...
static armorFrenchSpaces( $text, $space=' ')
Armor French spaces with a replacement character.
wfUrlProtocols( $includeProtocolRelative=true)
Returns a regular expression of url protocols.
static escapeId( $id, $options=[])
Given a value, escape it so that it can be used in an id attribute and return it. ...
static normalizeWhitespace( $text)
static attributeWhitelistInternal( $element)
Fetch the whitelist of acceptable attributes for a given element name.
const HTML_ENTITIES
List of all named character entities defined in HTML 4.01 (https://www.w3.org/TR/html4/sgml/entities.html), as well as &apos;, which is only defined starting in XHTML1.
const HTML_ENTITY_ALIASES
Character entity aliases accepted by MediaWiki.
const ID_PRIMARY
Tells escapeIdForAttribute() to encode the ID using the wiki's primary encoding.
static setupAttributeWhitelist()
Foreach array key (an allowed HTML element), return an array of allowed attributes.
static escapeIdReferenceList( $referenceString)
Given a string containing a space delimited list of ids, escape each id to match ids escaped by the e...
static isReservedDataAttribute( $attr)
Given an attribute name, checks whether it is a reserved data attribute (such as data-mw-foo) which i...
static normalizeCss( $value)
Normalize CSS into a format we can easily search for hostile input.
static $attribNameRegex
Lazy-initialised attribute name regex, see getAttribNameRegex()
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
static hexCharReference( $codepoint)
static attributeWhitelist( $element)
Fetch the whitelist of acceptable attributes for a given element name.
static escapeIdInternal( $id, $mode)
Helper for escapeIdFor*() functions.
const ID_FALLBACK
Tells escapeIdForAttribute() to encode the ID using the fallback encoding, or return false if no fallback...
static validateEmail( $addr)
Does a string look like an e-mail address?
static decCharReference( $codepoint)
const EVIL_URI_PATTERN
Blacklist for evil uris like javascript: WARNING: DO NOT use this in any place that actually requires...
static validateTagAttributes( $attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
const ELEMENT_BITS_REGEX
Acceptable tag name charset from HTML5 parsing spec https://www.w3.org/TR/html5/syntax.html#tag-open-state.
static stripAllTags( $html)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed, encoded as plain text.
static cssDecodeCallback( $matches)
static getRecognizedTagData( $extratags=[], $removetags=[])
Return the various lists of recognized tags.
static escapeIdForAttribute( $id, $mode=self::ID_PRIMARY)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid HTM...
static mergeAttributes( $a, $b)
Merge two sets of HTML attributes.
const CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferenc...
static getTagAttributeCallback( $set)
Pick the appropriate attribute value from a match set from the attribs regex matches.
static decodeChar( $codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEM...
$wgFragmentMode
How should section IDs be encoded? This array can contain 1 or 2 elements, each of them can be one of...
static cleanUrlCallback( $matches)
static decodeTagAttributes( $text)
Return an associative array of attribute names and values from a partial tag string.
static escapeClass( $class)
Given a value, escape it so that it can be used as a CSS class and return it.
static normalizeCharReferences( $text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
static validateTag( $params, $element)
Takes attribute names and values for a tag and the tag name and validates that the tag is allowed to ...
const XMLNS_ATTRIBUTE_PATTERN
static safeEncodeTagAttributes( $assoc_array)
Build a partial tag string from an associative array of attribute names and values as returned by dec...
static validateCodepoint( $codepoint)
Returns true if a given Unicode codepoint is a valid character in both HTML5 and XML.
static normalizeCharReferencesCallback( $matches)
static decodeCharReferencesCallback( $matches)
$wgAllowImageTag
A different approach to the above: simply allow the "<img>" tag to be used.
static encodeAttribute( $text)
Encode an attribute value for HTML output.
static escapeIdForLink( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
static removeHTMLcomments( $text)
Remove '<!--', '-->', and everything between.
wfDeprecated( $function, $version=false, $component=false, $callerOffset=2)
Throws a warning that $function is deprecated.
static decodeCharReferences( $text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string...
static escapeIdForExternalInterwiki( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
static normalizeEntity( $name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the equivalent numeric entity re...
static $attribsRegex
Lazy-initialised attributes regex, see getAttribsRegex()
static checkCss( $value)
Pick apart some CSS and check it for forbidden or unsafe structures.
static fixTagAttributes( $text, $element, $sorted=false)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML...
static decodeEntity( $name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that chara...
static getAttribsRegex()
Regular expression to match HTML/XML attribute pairs within a tag.
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
static removeHTMLtags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[], $warnCallback=null)
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments. ...
static validateAttributes( $attribs, $whitelist)
Take an array of attribute names and values and normalize or discard illegal values for the given whi...
static getAttribNameRegex()
Used in Sanitizer::decodeTagAttributes to filter attributes.
static normalizeSectionNameWhitespace( $section)
Normalizes whitespace in a section name, such as might be returned by Parser::stripSectionName(), for use in the id's that are used for section links.
static setupAttributeWhitelistInternal()
Foreach array key (an allowed HTML element), return an array of allowed attributes.
static run( $event, array $args=[], $deprecatedVersion=null)
Call hook functions defined in Hooks::register and $wgHooks.
static decodeCharReferencesAndNormalize( $text)
Decode any character references, numeric or named entities, in the text and normalize the resulting s...
static delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags='')
Perform an operation equivalent to preg_replace() with flags.
static safeEncodeAttribute( $text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing...