36 const CHAR_REFS_REGEX =
37 '/&([A-Za-z0-9\x80-\xff]+);
39 |&\#[xX]([0-9A-Fa-f]+);
# Splits one chunk of markup (the text that followed a '<') into five
# captures, in this order:
#   1. an optional leading slash (present for closing tags)
#   2. the element name: an ASCII letter followed by any run of characters
#      other than tab, newline, vertical tab, space, slash, '>' or NUL
#   3. the text between the element name and the closing bracket
#   4. the closing bracket, with or without a slash in front of it
#   5. the trailing text after the tag, which may not contain '<'
46 const ELEMENT_BITS_REGEX =
'!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
# Case-insensitive match for the word "javascript" or "vbscript" when it
# appears at the start of the value, after whitespace, or directly after the
# end of a CSS-style comment, and is followed by a non-word character or the
# end of the string.
56 const EVIL_URI_PATTERN =
'!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
# Matches an attribute name consisting of the "xmlns:" prefix followed by one
# or more of: colon, ASCII letters, digits, underscore, hyphen or dot.
57 const XMLNS_ATTRIBUTE_PATTERN =
"/^xmlns:[:A-Z_a-z-.0-9]+$/";
64 private static $htmlEntities = [
323 private static $htmlEntityAliases = [
331 private static $attribsRegex;
340 static function getAttribsRegex() {
341 if ( self::$attribsRegex ===
null ) {
342 $attribFirst =
'[:A-Z_a-z0-9]';
343 $attrib =
'[:A-Z_a-z-.0-9]';
344 $space =
'[\x09\x0a\x0c\x0d\x20]';
345 self::$attribsRegex =
346 "/(?:^|$space)({$attribFirst}{$attrib}*)
349 # The attribute value: quoted or alone
356 return self::$attribsRegex;
365 public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
368 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
369 $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
373 $globalContext = $wgAllowImageTag;
374 if ( !$staticInitialised || $staticInitialised != $globalContext ) {
375 $htmlpairsStatic = [ # Tags
that must be closed
376 'b',
'bdi',
'del',
'i',
'ins',
'u',
'font',
'big',
'small',
'sub',
'sup',
'h1',
377 'h2',
'h3',
'h4',
'h5',
'h6',
'cite',
'code',
'em',
's',
378 'strike',
'strong',
'tt',
'var',
'div',
'center',
379 'blockquote',
'ol',
'ul',
'dl',
'table',
'caption',
'pre',
380 'ruby',
'rb',
'rp',
'rt',
'rtc',
'p',
'span',
'abbr',
'dfn',
381 'kbd',
'samp',
'data',
'time',
'mark'
384 'br',
'wbr',
'hr',
'li',
'dt',
'dd',
'meta',
'link'
387 # Elements that cannot have close tags. This is (not coincidentally)
388 # also the list of tags for which the HTML 5 parsing algorithm
389 # requires you to "acknowledge the token's self-closing flag", i.e.
390 # a self-closing tag like <br/> is not an HTML 5 parse error only
393 'br',
'wbr',
'hr',
'meta',
'link'
396 $htmlnest = [ # Tags
that can be nested--??
397 'table',
'tr',
'td',
'th',
'div',
'blockquote',
'ol',
'ul',
398 'li',
'dl',
'dt',
'dd',
'font',
'big',
'small',
'sub',
'sup',
'span',
399 'var',
'kbd',
'samp',
'em',
'strong',
'q',
'ruby',
'bdo'
401 $tabletags = [ # Can only appear inside
table, we
will close
them
411 if ( $wgAllowImageTag ) {
412 $htmlsingle[] =
'img';
413 $htmlsingleonly[] =
'img';
416 $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
417 $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
419 # Convert them all to hashtables for faster lookup
420 $vars = [
'htmlpairsStatic',
'htmlsingle',
'htmlsingleonly',
'htmlnest',
'tabletags',
421 'htmllist',
'listtags',
'htmlsingleallowed',
'htmlelementsStatic' ];
423 $$var = array_flip( $$var );
425 $staticInitialised = $globalContext;
428 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
429 $extratags = array_flip( $extratags );
430 $removetags = array_flip( $removetags );
431 $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
432 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
435 'htmlpairs' => $htmlpairs,
436 'htmlsingle' => $htmlsingle,
437 'htmlsingleonly' => $htmlsingleonly,
438 'htmlnest' => $htmlnest,
439 'tabletags' => $tabletags,
440 'htmllist' => $htmllist,
441 'listtags' => $listtags,
442 'htmlsingleallowed' => $htmlsingleallowed,
443 'htmlelements' => $htmlelements,
462 public static function removeHTMLtags( $text, $processCallback =
null,
463 $args = [], $extratags = [], $removetags = [], $warnCallback =
null
465 extract( self::getRecognizedTagData( $extratags, $removetags ) );
467 # Remove HTML comments
468 $text = Sanitizer::removeHTMLcomments( $text );
469 $bits = explode(
'<', $text );
470 $text = str_replace(
'>',
'>', array_shift( $bits ) );
472 $tagstack = $tablestack = [];
473 foreach ( $bits
as $x ) {
475 # $slash: Does the current element start with a '/'?
476 # $t: Current element name
477 # $params: String between element name and >
478 # $brace: Ending '>' or '/>'
479 # $rest: Everything until the next element of $bits
480 if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
483 $slash =
$t =
$params = $brace = $rest =
null;
487 $t = strtolower(
$t );
488 if ( isset( $htmlelements[
$t] ) ) {
490 if ( $slash && isset( $htmlsingleonly[
$t] ) ) {
492 } elseif ( $slash ) {
493 # Closing a tag... is it the one we just opened?
494 MediaWiki\suppressWarnings();
495 $ot = array_pop( $tagstack );
496 MediaWiki\restoreWarnings();
499 if ( isset( $htmlsingleallowed[$ot] ) ) {
500 # Pop all elements with an optional close tag
501 # and see if we find a match below them
503 array_push( $optstack, $ot );
504 MediaWiki\suppressWarnings();
505 $ot = array_pop( $tagstack );
506 MediaWiki\restoreWarnings();
507 while ( $ot !=
$t && isset( $htmlsingleallowed[$ot] ) ) {
508 array_push( $optstack, $ot );
509 MediaWiki\suppressWarnings();
510 $ot = array_pop( $tagstack );
511 MediaWiki\restoreWarnings();
514 # No match. Push the optional elements back again
516 MediaWiki\suppressWarnings();
517 $ot = array_pop( $optstack );
518 MediaWiki\restoreWarnings();
520 array_push( $tagstack, $ot );
521 MediaWiki\suppressWarnings();
522 $ot = array_pop( $optstack );
523 MediaWiki\restoreWarnings();
527 MediaWiki\suppressWarnings();
528 array_push( $tagstack, $ot );
529 MediaWiki\restoreWarnings();
531 # <li> can be nested in <ul> or <ol>, skip those cases:
532 if ( !isset( $htmllist[$ot] ) || !isset( $listtags[
$t] ) ) {
537 if (
$t ==
'table' ) {
538 $tagstack = array_pop( $tablestack );
543 # Keep track for later
544 if ( isset( $tabletags[
$t] ) && !in_array(
'table', $tagstack ) ) {
546 } elseif ( in_array(
$t, $tagstack ) && !isset( $htmlnest[
$t] ) ) {
548 # Is it a self closed htmlpair ? (T7487)
549 } elseif ( $brace ==
'/>' && isset( $htmlpairs[
$t] ) ) {
555 if ( is_callable( $warnCallback ) ) {
556 call_user_func_array( $warnCallback, [
'deprecated-self-close-category' ] );
559 } elseif ( isset( $htmlsingleonly[
$t] ) ) {
560 # Hack to force empty tag for unclosable elements
562 } elseif ( isset( $htmlsingle[
$t] ) ) {
563 # Hack to not close $htmlsingle tags
565 # Still need to push this optionally-closed tag to
566 # the tag stack so that we can match end tags
567 # instead of marking them as bad.
568 array_push( $tagstack,
$t );
569 } elseif ( isset( $tabletags[
$t] ) && in_array(
$t, $tagstack ) ) {
573 if (
$t ==
'table' ) {
574 array_push( $tablestack, $tagstack );
577 array_push( $tagstack,
$t );
580 # Replace any variables or template parameters with
582 if ( is_callable( $processCallback ) ) {
583 call_user_func_array( $processCallback, [ &
$params,
$args ] );
586 if ( !Sanitizer::validateTag(
$params,
$t ) ) {
590 # Strip non-approved attributes from the tag
591 $newparams = Sanitizer::fixTagAttributes(
$params,
$t );
594 $rest = str_replace(
'>',
'>', $rest );
595 $close = ( $brace ==
'/>' && !$slash ) ?
' /' :
'';
596 $text .=
"<$slash$t$newparams$close>$rest";
600 $text .=
'<' . str_replace(
'>',
'>', $x );
602 # Close off any remaining tags
603 while ( is_array( $tagstack ) && (
$t = array_pop( $tagstack ) ) ) {
605 if (
$t ==
'table' ) {
606 $tagstack = array_pop( $tablestack );
610 # this might be possible using tidy itself
611 foreach ( $bits
as $x ) {
612 if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
616 $t = strtolower(
$t );
617 if ( isset( $htmlelements[
$t] ) ) {
618 if ( is_callable( $processCallback ) ) {
619 call_user_func_array( $processCallback, [ &
$params,
$args ] );
622 if ( $brace ==
'/>' && !( isset( $htmlsingle[
$t] ) || isset( $htmlsingleonly[
$t] ) ) ) {
628 if ( is_callable( $warnCallback ) ) {
629 call_user_func_array( $warnCallback, [
'deprecated-self-close-category' ] );
632 if ( !Sanitizer::validateTag(
$params,
$t ) ) {
636 $newparams = Sanitizer::fixTagAttributes(
$params,
$t );
638 if ( $brace ===
'/>' && !isset( $htmlsingleonly[
$t] ) ) {
639 # Interpret self-closing tags as empty tags even when
640 # HTML 5 would interpret them as start tags. Such input
641 # is commonly seen on Wikimedia wikis with this intention.
645 $rest = str_replace(
'>',
'>', $rest );
646 $text .=
"<$slash$t$newparams$brace$rest";
651 $text .=
'<' . str_replace(
'>',
'>', $x );
666 public static function removeHTMLcomments( $text ) {
667 while ( ( $start = strpos( $text,
'<!--' ) ) !==
false ) {
668 $end = strpos( $text,
'-->', $start + 4 );
669 if ( $end ===
false ) {
670 # Unterminated comment; bail out
676 # Trim space and newline if the comment is both
677 # preceded and followed by a newline
678 $spaceStart = max( $start - 1, 0 );
679 $spaceLen = $end - $spaceStart;
680 while ( substr( $text, $spaceStart, 1 ) ===
' ' && $spaceStart > 0 ) {
684 while ( substr( $text, $spaceStart + $spaceLen, 1 ) ===
' ' ) {
687 if ( substr( $text, $spaceStart, 1 ) ===
"\n"
688 && substr( $text, $spaceStart + $spaceLen, 1 ) ===
"\n" ) {
689 # Remove the comment, leading and trailing
690 # spaces, and leave only one newline.
691 $text = substr_replace( $text,
"\n", $spaceStart, $spaceLen + 1 );
693 # Remove just the comment.
694 $text = substr_replace( $text,
'', $start, $end - $start );
712 static function validateTag(
$params, $element ) {
715 if ( $element ==
'meta' || $element ==
'link' ) {
716 if ( !isset(
$params[
'itemprop'] ) ) {
720 if ( $element ==
'meta' && !isset(
$params[
'content'] ) ) {
724 if ( $element ==
'link' && !isset(
$params[
'href'] ) ) {
/**
 * Filter a set of attributes down to those permitted on one element.
 *
 * The whitelist for the element is fetched with attributeWhitelist() and
 * the filtering itself is delegated to validateAttributes().
 *
 * @param array $attribs Attributes keyed by name
 * @param string $element Element name used for the whitelist lookup
 * @return array Result of validateAttributes() for that whitelist
 */
static function validateTagAttributes( $attribs, $element ) {
	$allowed = Sanitizer::attributeWhitelist( $element );

	return Sanitizer::validateAttributes( $attribs, $allowed );
}
768 static function validateAttributes(
$attribs, $whitelist ) {
769 $whitelist = array_flip( $whitelist );
774 # Allow XML namespace declaration to allow RDFa
775 if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
776 if ( !preg_match( self::EVIL_URI_PATTERN,
$value ) ) {
783 # Allow any attribute beginning with "data-"
785 # * data-ooui is reserved for ooui
786 # * data-mw and data-parsoid are reserved for parsoid
787 # * data-mw-<name here> is reserved for extensions (or core) if
788 # they need to communicate some data to the client and want to be
789 # sure that it isn't coming from an untrusted user.
790 # * Ensure that the attribute is not namespaced by banning
792 if ( !preg_match(
'/^data-(?!ooui|mw|parsoid)[^:]*$/i', $attribute )
793 && !isset( $whitelist[$attribute] )
798 # Strip javascript "expression" from stylesheets.
799 # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
800 if ( $attribute ==
'style' ) {
804 # Escape HTML id attributes
805 if ( $attribute ===
'id' ) {
809 # Escape HTML id reference lists
810 if ( $attribute ===
'aria-describedby'
811 || $attribute ===
'aria-flowto'
812 || $attribute ===
'aria-labelledby'
813 || $attribute ===
'aria-owns'
815 $value = Sanitizer::escapeIdReferenceList(
$value,
'noninitial' );
820 if ( $attribute ===
'rel' || $attribute ===
'rev'
822 || $attribute ===
'about' || $attribute ===
'property'
823 || $attribute ===
'resource' || $attribute ===
'datatype'
824 || $attribute ===
'typeof'
826 || $attribute ===
'itemid' || $attribute ===
'itemprop'
827 || $attribute ===
'itemref' || $attribute ===
'itemscope'
828 || $attribute ===
'itemtype'
831 if ( preg_match( self::EVIL_URI_PATTERN,
$value ) ) {
836 # NOTE: even though elements using href/src are not allowed directly, supply
837 # validation code that can be used by tag hook handlers, etc
838 if ( $attribute ===
'href' || $attribute ===
'src' ) {
839 if ( !preg_match( $hrefExp,
$value ) ) {
850 # itemtype, itemid, itemref don't make sense without itemscope
851 if ( !array_key_exists(
'itemscope',
$out ) ) {
852 unset(
$out[
'itemtype'] );
853 unset(
$out[
'itemid'] );
854 unset(
$out[
'itemref'] );
856 # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
871 static function mergeAttributes( $a, $b ) {
872 $out = array_merge( $a, $b );
873 if ( isset( $a[
'class'] ) && isset( $b[
'class'] )
874 && is_string( $a[
'class'] ) && is_string( $b[
'class'] )
875 && $a[
'class'] !== $b[
'class']
877 $classes = preg_split(
'/\s+/',
"{$a['class']} {$b['class']}",
878 -1, PREG_SPLIT_NO_EMPTY );
879 $out[
'class'] = implode(
' ', array_unique( $classes ) );
893 public static function normalizeCss(
$value ) {
908 if ( !$decodeRegex ) {
909 $space =
'[\\x20\\t\\r\\n\\f]';
910 $nl =
'(?:\\n|\\r\\n|\\r|\\f)';
912 $decodeRegex =
"/ $backslash
914 ($nl) | # 1. Line continuation
915 ([0-9A-Fa-f]{1,6})$space? | # 2. character number
916 (.) | # 3. backslash cancelling special meaning
917 () | # 4. backslash at end of string
920 $value = preg_replace_callback( $decodeRegex,
921 [ __CLASS__,
'cssDecodeCallback' ],
$value );
924 $value = preg_replace_callback(
928 if ( $cp ===
false ) {
931 return chr( $cp - 65248 );
939 [
'ʀ',
'ɴ',
'ⁿ',
'ʟ',
'ɪ',
'⁽',
'₍' ],
940 [
'r',
'n',
'n',
'l',
'i',
'(',
'(' ],
947 if ( !preg_match(
'! ^ \s* /\* [^*\\/]* \*/ \s* $ !x',
$value ) ) {
958 $commentPos = strpos(
$value,
'/*' );
959 if ( $commentPos !==
false ) {
968 \xE3\x80\xB1 | # U+3031
969 \xE3\x82\x9D | # U+309D
970 \xE3\x83\xBC | # U+30FC
971 \xE3\x83\xBD | # U+30FD
972 \xEF\xB9\xBC | # U+FE7C
973 \xEF\xB9\xBD | # U+FE7D
974 \xEF\xBD\xB0 # U+FF70
1001 static function checkCss(
$value ) {
1005 if ( preg_match(
'/[\000-\010\013\016-\037\177]/',
$value ) ||
1007 return '/* invalid control char */';
1008 } elseif ( preg_match(
1013 | -o-link-source\s*:
1018 | attr\s*\([^)]+[\s,]+url
1020 return '/* insecure input */';
1029 static function cssDecodeCallback(
$matches ) {
1040 if ( $char ==
"\n" || $char ==
'"' || $char ==
"'" || $char ==
'\\' ) {
1043 return '\\' . dechex( ord( $char ) ) .
' ';
1071 static function fixTagAttributes( $text, $element, $sorted =
false ) {
1072 if ( trim( $text ) ==
'' ) {
1076 $decoded = Sanitizer::decodeTagAttributes( $text );
1077 $stripped = Sanitizer::validateTagAttributes( $decoded, $element );
1083 return Sanitizer::safeEncodeTagAttributes( $stripped );
1091 static function encodeAttribute( $text ) {
1092 $encValue = htmlspecialchars( $text, ENT_QUOTES );
1097 $encValue = strtr( $encValue, [
1112 static function safeEncodeAttribute( $text ) {
1113 $encValue = Sanitizer::encodeAttribute( $text );
1115 # Templates and links may be expanded in later parsing,
1116 # creating invalid or dangerous output. Suppress this.
1117 $encValue = strtr( $encValue, [
1124 "''" =>
'''',
1125 'ISBN' =>
'ISBN',
1127 'PMID' =>
'PMID',
1133 $encValue = preg_replace_callback(
1135 [
'Sanitizer',
'armorLinksCallback' ],
1171 static function escapeId( $id,
$options = [] ) {
1172 global $wgExperimentalHtmlIds;
1175 $id = Sanitizer::decodeCharReferences( $id );
1177 if ( $wgExperimentalHtmlIds && !in_array(
'legacy',
$options ) ) {
1178 $id = preg_replace(
'/[ \t\n\r\f_\'"&#%]+/',
'_', $id );
1179 $id = trim( $id,
'_' );
1194 $id = urlencode( strtr( $id,
' ',
'_' ) );
1195 $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
1197 if ( !preg_match(
'/^[a-zA-Z]/', $id ) && !in_array(
'noninitial',
$options ) ) {
/**
 * Escape each id in a whitespace-separated list of id references.
 *
 * @param string $referenceString Whitespace-delimited list of ids
 * @param string|array $options Handed unchanged to escapeId() for every id
 * @return string The escaped ids joined by single spaces, or '' when the
 *   input contains no ids
 */
static function escapeIdReferenceList( $referenceString, $options = [] ) {
	# Tokenise on runs of whitespace, discarding empty tokens
	$tokens = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );

	# Escape every token as an id
	$escaped = array_map(
		function ( $token ) use ( $options ) {
			return Sanitizer::escapeId( $token, $options );
		},
		$tokens
	);

	# Rebuild the space-delimited list; an empty array yields ''
	return implode( ' ', $escaped );
}
1248 static function escapeClass( $class ) {
1250 return rtrim( preg_replace(
1251 [
'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
'/_+/' ],
1263 static function escapeHtmlAllowEntities(
$html ) {
1264 $html = Sanitizer::decodeCharReferences(
$html );
1265 # It seems wise to escape ' as well as ", as a matter of course. Can't
1266 # hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
1267 # don't cause the entire string to disappear.
1268 $html = htmlspecialchars(
$html, ENT_QUOTES | ENT_SUBSTITUTE );
/**
 * Callback for preg_replace_callback(): armor the colon of a matched link
 * prefix so later parsing no longer sees it as a link.
 *
 * Every ':' in the first capture group is rewritten as the numeric
 * character reference for a colon. Replacing ':' with a literal ':' would
 * leave the text untouched and make the callback a no-op.
 *
 * @param array $matches Match array; only index 1 is read
 * @return string The first capture group with each colon replaced
 */
private static function armorLinksCallback( $matches ) {
	return str_replace( ':', '&#58;', $matches[1] );
}
1289 public static function decodeTagAttributes( $text ) {
1290 if ( trim( $text ) ==
'' ) {
1296 if ( !preg_match_all(
1297 self::getAttribsRegex(),
1300 PREG_SET_ORDER ) ) {
1304 foreach ( $pairs
as $set ) {
1305 $attribute = strtolower( $set[1] );
1306 $value = Sanitizer::getTagAttributeCallback( $set );
1313 $attribs[$attribute] = Sanitizer::decodeCharReferences(
$value );
1325 public static function safeEncodeTagAttributes( $assoc_array ) {
1327 foreach ( $assoc_array
as $attribute =>
$value ) {
1328 $encAttribute = htmlspecialchars( $attribute );
1329 $encValue = Sanitizer::safeEncodeAttribute(
$value );
1331 $attribs[] =
"$encAttribute=\"$encValue\"";
1344 private static function getTagAttributeCallback( $set ) {
1345 if ( isset( $set[5] ) ) {
1348 } elseif ( isset( $set[4] ) ) {
1351 } elseif ( isset( $set[3] ) ) {
1354 } elseif ( !isset( $set[2] ) ) {
1355 # In XHTML, attributes must have a value so return an empty string.
1356 # See "Empty attribute syntax",
1357 # https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
1360 throw new MWException(
"Tag conditions not met. This should never happen and is a bug." );
1368 private static function normalizeWhitespace( $text ) {
1369 return preg_replace(
1370 '/\r\n|[\x20\x0d\x0a\x09]/',
/**
 * Normalise the whitespace of a section name.
 *
 * Each run of spaces and/or underscores becomes a single space, after
 * which leading and trailing whitespace is trimmed.
 *
 * @param string $section Section name to normalise
 * @return string The normalised section name
 */
static function normalizeSectionNameWhitespace( $section ) {
	$collapsed = preg_replace( '/[ _]+/', ' ', $section );

	return trim( $collapsed );
}
1402 static function normalizeCharReferences( $text ) {
1403 return preg_replace_callback(
1404 self::CHAR_REFS_REGEX,
1405 [
'Sanitizer',
'normalizeCharReferencesCallback' ],
1413 static function normalizeCharReferencesCallback(
$matches ) {
1422 if ( is_null(
$ret ) ) {
1423 return htmlspecialchars(
$matches[0] );
1439 static function normalizeEntity(
$name ) {
1440 if ( isset( self::$htmlEntityAliases[
$name] ) ) {
1441 return '&' . self::$htmlEntityAliases[
$name] .
';';
1442 } elseif ( in_array(
$name, [
'lt',
'gt',
'amp',
'quot' ] ) ) {
1444 } elseif ( isset( self::$htmlEntities[
$name] ) ) {
1445 return '&#' . self::$htmlEntities[
$name] .
';';
1447 return "&$name;";
/**
 * Build a decimal numeric character reference for a codepoint.
 *
 * @param int|string $codepoint Codepoint value; converted with intval()
 * @return string|null The reference in decimal form, or null when
 *   validateCodepoint() rejects the codepoint
 */
static function decCharReference( $codepoint ) {
	$point = intval( $codepoint );

	if ( !Sanitizer::validateCodepoint( $point ) ) {
		return null;
	}

	return sprintf( '&#%d;', $point );
}
/**
 * Build a hexadecimal numeric character reference for a codepoint.
 *
 * @param string $codepoint Hexadecimal digits; converted with hexdec()
 * @return string|null The reference with lower-case hex digits, or null
 *   when validateCodepoint() rejects the codepoint
 */
static function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );

	if ( !Sanitizer::validateCodepoint( $point ) ) {
		return null;
	}

	return sprintf( '&#x%x;', $point );
}
/**
 * Decide whether a codepoint may be emitted as a character reference.
 *
 * Accepted values are tab, line feed, and the inclusive ranges listed
 * below; everything else is rejected.
 *
 * @param int $codepoint Codepoint to check
 * @return bool True when the codepoint is accepted
 */
private static function validateCodepoint( $codepoint ) {
	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	# U+007F - U+009F are disallowed in HTML5 (control characters).
	if ( $codepoint == 0x09 || $codepoint == 0x0a ) {
		return true;
	}

	# Inclusive [low, high] ranges of accepted codepoints
	$ranges = [
		[ 0x20, 0x7e ],
		[ 0xa0, 0xd7ff ],
		[ 0xe000, 0xfffd ],
		[ 0x10000, 0x10ffff ],
	];
	foreach ( $ranges as $range ) {
		if ( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
			return true;
		}
	}

	return false;
}
1502 public static function decodeCharReferences( $text ) {
1503 return preg_replace_callback(
1504 self::CHAR_REFS_REGEX,
1505 [
'Sanitizer',
'decodeCharReferencesCallback' ],
1519 public static function decodeCharReferencesAndNormalize( $text ) {
1521 $text = preg_replace_callback(
1522 self::CHAR_REFS_REGEX,
1523 [
'Sanitizer',
'decodeCharReferencesCallback' ],
1524 $text, -1, $count );
1537 static function decodeCharReferencesCallback(
$matches ) {
1539 return Sanitizer::decodeEntity(
$matches[1] );
1541 return Sanitizer::decodeChar( intval(
$matches[2] ) );
1543 return Sanitizer::decodeChar( hexdec(
$matches[3] ) );
1545 # Last case should be an ampersand by itself
1556 static function decodeChar( $codepoint ) {
1557 if ( Sanitizer::validateCodepoint( $codepoint ) ) {
1572 static function decodeEntity(
$name ) {
1573 if ( isset( self::$htmlEntityAliases[
$name] ) ) {
1576 if ( isset( self::$htmlEntities[
$name] ) ) {
1589 static function attributeWhitelist( $element ) {
1590 $list = Sanitizer::setupAttributeWhitelist();
1591 return isset( $list[$element] )
1601 static function setupAttributeWhitelist() {
1604 if ( $whitelist !==
null ) {
1626 # These attributes are specified in section 9 of
1627 # https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
1634 # Microdata. These are specified by
1635 # https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
1643 $block = array_merge( $common, [
'align' ] );
1644 $tablealign = [
'align',
'valign' ];
1652 'nowrap', # deprecated
1653 'width', # deprecated
1654 'height', # deprecated
1655 'bgcolor', # deprecated
1658 # Numbers refer to sections in HTML 4.01 standard describing the element.
1659 # See: https://www.w3.org/TR/html4/
1663 'center' => $common, # deprecated
1682 'strong' => $common,
1693 'blockquote' => array_merge( $common, [
'cite' ] ),
1694 'q' => array_merge( $common, [
'cite' ] ),
1704 'br' => array_merge( $common, [
'clear' ] ),
1706 # https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
1710 'pre' => array_merge( $common, [
'width' ] ),
1713 'ins' => array_merge( $common, [
'cite',
'datetime' ] ),
1714 'del' => array_merge( $common, [
'cite',
'datetime' ] ),
1717 'ul' => array_merge( $common, [
'type' ] ),
1718 'ol' => array_merge( $common, [
'type',
'start',
'reversed' ] ),
1719 'li' => array_merge( $common, [
'type',
'value' ] ),
1727 'table' => array_merge( $common,
1728 [
'summary',
'width',
'border',
'frame',
1729 'rules',
'cellspacing',
'cellpadding',
1734 'caption' => $block,
1742 'colgroup' => array_merge( $common, [
'span' ] ),
1743 'col' => array_merge( $common, [
'span' ] ),
1746 'tr' => array_merge( $common, [
'bgcolor' ], $tablealign ),
1749 'td' => array_merge( $common, $tablecell, $tablealign ),
1750 'th' => array_merge( $common, $tablecell, $tablealign ),
1753 # NOTE: <a> is not allowed directly, but the attrib
1754 # whitelist is used from the Parser object
1755 'a' => array_merge( $common, [
'href',
'rel',
'rev' ] ), # rel/rev esp.
for RDFa
1758 # Not usually allowed, but may be used for extension-style hooks
1759 # such as <math> when it is rasterized, or if $wgAllowImageTag is
1761 'img' => array_merge( $common, [
'alt',
'src',
'width',
'height' ] ),
1769 'strike' => $common,
1774 'font' => array_merge( $common, [
'size',
'color',
'face' ] ),
1778 'hr' => array_merge( $common, [
'width' ] ),
1780 # HTML Ruby annotation text module, simple ruby only.
1781 # https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
1786 'rt' => $common, # array_merge( $common,
array(
'rbspan' ) ),
1789 # MathML root element, where used for extensions
1790 # 'title' may not be 100% valid here; it's XHTML
1791 # https://www.w3.org/TR/REC-MathML/
1792 'math' => [
'class',
'style',
'id',
'title' ],
1794 # HTML 5 section 4.6
1797 # HTML5 elements, defined by:
1798 # https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
1799 'data' => array_merge( $common, [
'value' ] ),
1800 'time' => array_merge( $common, [
'datetime' ] ),
1808 'meta' => [
'itemprop',
'content' ],
1809 'link' => [
'itemprop',
'href' ],
1825 static function stripAllTags( $text ) {
1829 # Normalize &entities and whitespace
1830 $text = self::decodeCharReferences( $text );
1831 $text = self::normalizeWhitespace( $text );
1845 static function hackDocType() {
1846 $out =
"<!DOCTYPE html [\n";
1847 foreach ( self::$htmlEntities
as $entity => $codepoint ) {
1848 $out .=
"<!ENTITY $entity \"&#$codepoint;\">";
1858 static function cleanUrl( $url ) {
1859 # Normalize any HTML entities in input. They will be
1860 # re-escaped by makeExternalLink().
1861 $url = Sanitizer::decodeCharReferences( $url );
1863 # Escape any control characters introduced by the above step
1864 $url = preg_replace_callback(
'/[\][<>"\\x00-\\x20\\x7F\|]/',
1865 [ __CLASS__,
'cleanUrlCallback' ], $url );
1867 # Validate hostname portion
1869 if ( preg_match(
'!^([^:]+:)(//[^/]+)?(.*)$!iD', $url,
$matches ) ) {
1876 \\s| # general whitespace
1877 \xc2\xad| # 00ad SOFT HYPHEN
1878 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
1879 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
1880 \xe2\x81\xa0| # 2060 WORD JOINER
1881 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
1882 \xcd\x8f| # 034f COMBINING GRAPHEME JOINER
1883 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
1884 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
1885 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
1886 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
1887 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
1888 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
1891 $host = preg_replace( $strip,
'', $host );
1894 if ( substr_compare(
"//%5B", $host, 0, 5 ) === 0 &&
1895 preg_match(
'!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host,
$matches )
1902 return $protocol . $host . $rest;
1912 static function cleanUrlCallback(
$matches ) {
1944 public static function validateEmail( $addr ) {
1953 $rfc5322_atext =
"a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
1954 $rfc1034_ldh_str =
"a-z0-9\\-";
1956 $html5_email_regexp =
"/
1958 [$rfc5322_atext\\.]+ # user part which is liberal :p
1960 [$rfc1034_ldh_str]+ # First domain part
1961 (\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
1965 return (
bool)preg_match( $html5_email_regexp, $addr );