# Pattern constants and static tables shared by the methods below.
#  - CHAR_REFS_REGEX: matches character references; the alternatives visible
#    here cover the named form and the hexadecimal numeric form.
#  - ELEMENT_BITS_REGEX: splits the text following a "<" into five groups:
#    optional leading slash, element name, parameter string, closing brace
#    and the trailing text up to the next "<".
#  - EVIL_URI_PATTERN: case-insensitive match for "javascript" / "vbscript"
#    at the start of a value, after whitespace, or after a comment terminator.
#  - XMLNS_ATTRIBUTE_PATTERN: matches attribute names of the form xmlns:NAME.
#  - ID_FALLBACK: integer mode selector (1); presumably the counterpart of
#    the ID_PRIMARY mode used by escapeIdForAttribute() -- TODO confirm.
#  - $htmlEntities / $htmlEntityAliases: entity lookup tables (contents not
#    visible in this view).
#  - $attribsRegex: cache slot filled lazily by getAttribsRegex().
36 const CHAR_REFS_REGEX =
37 '/&([A-Za-z0-9\x80-\xff]+);
39 |&\#[xX]([0-9A-Fa-f]+);
46 const ELEMENT_BITS_REGEX =
'!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
56 const EVIL_URI_PATTERN =
'!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
57 const XMLNS_ATTRIBUTE_PATTERN =
"/^xmlns:[:A-Z_a-z-.0-9]+$/";
72 const ID_FALLBACK = 1;
79 private static $htmlEntities = [
338 private static $htmlEntityAliases = [
346 private static $attribsRegex;
# Returns the regular expression used to pull attribute name/value pairs out
# of a tag's parameter string (see decodeTagAttributes()).
#
# The pattern is assembled from three character classes -- the allowed first
# character of a name, the allowed following characters, and whitespace --
# and is built only while self::$attribsRegex is still null; afterwards the
# cached string is returned.
#
# @return string The cached pattern (compiled with the s, x and u modifiers)
355 static function getAttribsRegex() {
356 if ( self::$attribsRegex ===
null ) {
357 $attribFirst =
"[:_\p{L}\p{N}]";
358 $attrib =
"[:_\.\-\p{L}\p{N}]";
359 $space =
'[\x09\x0a\x0c\x0d\x20]';
360 self::$attribsRegex =
361 "/(?:^|$space)({$attribFirst}{$attrib}*)
364 # The attribute value: quoted or alone
369 )?(?=$space|\$)/sxu";
371 return self::$attribsRegex;
# Builds the lookup tables of recognised HTML elements that removeHTMLtags()
# extracts into its local scope.
#
# The element lists (elements that must be closed, single elements, elements
# that cannot have close tags, nestable elements, table-only elements, ...)
# are kept in function-static variables. They are rebuilt when
# $staticInitialised is unset or differs from $globalContext, and are flipped
# into hashtables so callers can test membership with isset().
# 'img' is appended to the single-element lists; the condition guarding that
# is not visible in this view.
#
# @param array $extratags Extra element names, merged into both 'htmlpairs'
#   and 'htmlelements'
# @param array $removetags Element names removed from 'htmlelements'
# @return array Map with the keys htmlpairs, htmlsingle, htmlsingleonly,
#   htmlnest, tabletags, htmllist, listtags, htmlsingleallowed, htmlelements
380 public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
383 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
384 $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
389 if ( !$staticInitialised || $staticInitialised != $globalContext ) {
390 $htmlpairsStatic = [ # Tags
that must be closed
391 'b',
'bdi',
'del',
'i',
'ins',
'u',
'font',
'big',
'small',
'sub',
'sup',
'h1',
392 'h2',
'h3',
'h4',
'h5',
'h6',
'cite',
'code',
'em',
's',
393 'strike',
'strong',
'tt',
'var',
'div',
'center',
394 'blockquote',
'ol',
'ul',
'dl',
'table',
'caption',
'pre',
395 'ruby',
'rb',
'rp',
'rt',
'rtc',
'p',
'span',
'abbr',
'dfn',
396 'kbd',
'samp',
'data',
'time',
'mark'
399 'br',
'wbr',
'hr',
'li',
'dt',
'dd',
'meta',
'link'
402 # Elements that cannot have close tags. This is (not coincidentally)
403 # also the list of tags for which the HTML 5 parsing algorithm
404 # requires you to "acknowledge the token's self-closing flag", i.e.
405 # a self-closing tag like <br/> is not an HTML 5 parse error only
408 'br',
'wbr',
'hr',
'meta',
'link'
411 $htmlnest = [ # Tags
that can be nested--??
412 'table',
'tr',
'td',
'th',
'div',
'blockquote',
'ol',
'ul',
413 'li',
'dl',
'dt',
'dd',
'font',
'big',
'small',
'sub',
'sup',
'span',
414 'var',
'kbd',
'samp',
'em',
'strong',
'q',
'ruby',
'bdo'
416 $tabletags = [ # Can only appear inside
table, we
will close
them
427 $htmlsingle[] =
'img';
428 $htmlsingleonly[] =
'img';
431 $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
432 $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
434 # Convert them all to hashtables for faster lookup
435 $vars = [
'htmlpairsStatic',
'htmlsingle',
'htmlsingleonly',
'htmlnest',
'tabletags',
436 'htmllist',
'listtags',
'htmlsingleallowed',
'htmlelementsStatic' ];
438 $$var = array_flip( $$var );
440 $staticInitialised = $globalContext;
443 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
444 $extratags = array_flip( $extratags );
445 $removetags = array_flip( $removetags );
446 $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
447 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
450 'htmlpairs' => $htmlpairs,
451 'htmlsingle' => $htmlsingle,
452 'htmlsingleonly' => $htmlsingleonly,
453 'htmlnest' => $htmlnest,
454 'tabletags' => $tabletags,
455 'htmllist' => $htmllist,
456 'listtags' => $listtags,
457 'htmlsingleallowed' => $htmlsingleallowed,
458 'htmlelements' => $htmlelements,
# Cleans up HTML in $text: strips HTML comments, splits the text on "<", and
# examines each fragment with ELEMENT_BITS_REGEX. Fragments naming a
# recognised element are re-emitted with their attributes validated
# (validateTag) and filtered (fixTagAttributes); every other fragment is
# appended back after a literal "<".
#
# Two passes over the fragments are visible. The first tracks open elements
# on $tagstack / $tablestack so that close tags can be matched, optional
# close tags skipped, and remaining open elements closed at the end. The
# second only validates elements and attributes, without balancing.
# NOTE(review): the condition that selects between the two passes is not
# visible in this view.
# NOTE(review): every str_replace( '>', '>', ... ) call below replaces the
# character with itself as written, so it leaves its input unchanged --
# confirm whether the replacement was meant to be an escaped form.
#
# @param string $text Text to clean
# @param callable|null $processCallback Invoked with ( &$params, $args ) to
#   rewrite an element's parameter string before it is validated
# @param array $args Passed through to $processCallback
# @param array $extratags Extra element names to recognise
# @param array $removetags Element names to stop recognising
# @param callable|null $warnCallback Invoked with
#   'deprecated-self-close-category' when a self-closed element is seen that
#   is not one of the single elements
477 public static function removeHTMLtags( $text, $processCallback =
null,
478 $args = [], $extratags = [], $removetags = [], $warnCallback =
null
480 extract( self::getRecognizedTagData( $extratags, $removetags ) );
482 # Remove HTML comments
483 $text = self::removeHTMLcomments( $text );
484 $bits = explode(
'<', $text );
485 $text = str_replace(
'>',
'>', array_shift( $bits ) );
487 $tagstack = $tablestack = [];
488 foreach ( $bits
as $x ) {
490 # $slash: Does the current element start with a '/'?
491 # $t: Current element name
492 # $params: String between element name and >
493 # $brace: Ending '>' or '/>'
494 # $rest: Everything until the next element of $bits
495 if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
498 $slash =
$t =
$params = $brace = $rest =
null;
502 $t = strtolower(
$t );
503 if ( isset( $htmlelements[
$t] ) ) {
505 if ( $slash && isset( $htmlsingleonly[
$t] ) ) {
507 } elseif ( $slash ) {
508 # Closing a tag... is it the one we just opened?
509 MediaWiki\suppressWarnings();
510 $ot = array_pop( $tagstack );
511 MediaWiki\restoreWarnings();
514 if ( isset( $htmlsingleallowed[$ot] ) ) {
515 # Pop all elements with an optional close tag
516 # and see if we find a match below them
518 array_push( $optstack, $ot );
519 MediaWiki\suppressWarnings();
520 $ot = array_pop( $tagstack );
521 MediaWiki\restoreWarnings();
522 while ( $ot !=
$t && isset( $htmlsingleallowed[$ot] ) ) {
523 array_push( $optstack, $ot );
524 MediaWiki\suppressWarnings();
525 $ot = array_pop( $tagstack );
526 MediaWiki\restoreWarnings();
529 # No match. Push the optional elements back again
531 MediaWiki\suppressWarnings();
532 $ot = array_pop( $optstack );
533 MediaWiki\restoreWarnings();
535 array_push( $tagstack, $ot );
536 MediaWiki\suppressWarnings();
537 $ot = array_pop( $optstack );
538 MediaWiki\restoreWarnings();
542 MediaWiki\suppressWarnings();
543 array_push( $tagstack, $ot );
544 MediaWiki\restoreWarnings();
546 # <li> can be nested in <ul> or <ol>, skip those cases:
547 if ( !isset( $htmllist[$ot] ) || !isset( $listtags[
$t] ) ) {
552 if (
$t ==
'table' ) {
553 $tagstack = array_pop( $tablestack );
558 # Keep track for later
559 if ( isset( $tabletags[
$t] ) && !in_array(
'table', $tagstack ) ) {
561 } elseif ( in_array(
$t, $tagstack ) && !isset( $htmlnest[
$t] ) ) {
563 # Is it a self closed htmlpair ? (T7487)
564 } elseif ( $brace ==
'/>' && isset( $htmlpairs[
$t] ) ) {
570 if ( is_callable( $warnCallback ) ) {
571 call_user_func_array( $warnCallback, [
'deprecated-self-close-category' ] );
574 } elseif ( isset( $htmlsingleonly[
$t] ) ) {
575 # Hack to force empty tag for unclosable elements
577 } elseif ( isset( $htmlsingle[
$t] ) ) {
578 # Hack to not close $htmlsingle tags
580 # Still need to push this optionally-closed tag to
581 # the tag stack so that we can match end tags
582 # instead of marking them as bad.
583 array_push( $tagstack,
$t );
584 } elseif ( isset( $tabletags[
$t] ) && in_array(
$t, $tagstack ) ) {
588 if (
$t ==
'table' ) {
589 array_push( $tablestack, $tagstack );
592 array_push( $tagstack,
$t );
595 # Replace any variables or template parameters with
597 if ( is_callable( $processCallback ) ) {
598 call_user_func_array( $processCallback, [ &
$params,
$args ] );
601 if ( !self::validateTag(
$params,
$t ) ) {
605 # Strip non-approved attributes from the tag
606 $newparams = self::fixTagAttributes(
$params,
$t );
609 $rest = str_replace(
'>',
'>', $rest );
610 $close = ( $brace ==
'/>' && !$slash ) ?
' /' :
'';
611 $text .=
"<$slash$t$newparams$close>$rest";
615 $text .=
'<' . str_replace(
'>',
'>', $x );
617 # Close off any remaining tags
618 while ( is_array( $tagstack ) && (
$t = array_pop( $tagstack ) ) ) {
620 if (
$t ==
'table' ) {
621 $tagstack = array_pop( $tablestack );
625 # this might be possible using tidy itself
626 foreach ( $bits
as $x ) {
627 if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
631 $t = strtolower(
$t );
632 if ( isset( $htmlelements[
$t] ) ) {
633 if ( is_callable( $processCallback ) ) {
634 call_user_func_array( $processCallback, [ &
$params,
$args ] );
637 if ( $brace ==
'/>' && !( isset( $htmlsingle[
$t] ) || isset( $htmlsingleonly[
$t] ) ) ) {
643 if ( is_callable( $warnCallback ) ) {
644 call_user_func_array( $warnCallback, [
'deprecated-self-close-category' ] );
647 if ( !self::validateTag(
$params,
$t ) ) {
651 $newparams = self::fixTagAttributes(
$params,
$t );
653 if ( $brace ===
'/>' && !isset( $htmlsingleonly[
$t] ) ) {
654 # Interpret self-closing tags as empty tags even when
655 # HTML 5 would interpret them as start tags. Such input
656 # is commonly seen on Wikimedia wikis with this intention.
660 $rest = str_replace(
'>',
'>', $rest );
661 $text .=
"<$slash$t$newparams$brace$rest";
666 $text .=
'<' . str_replace(
'>',
'>', $x );
# Removes HTML comments from $text.
#
# Comments are located one at a time by searching for the opening marker and
# then for the closing marker after it; an unterminated comment ends the
# scan. When a comment, together with the spaces around it, sits between two
# newlines, the comment, those spaces and one newline are replaced by a
# single newline. Otherwise only the comment itself is removed.
#
# @param string $text
# @return string The text without comments (removeHTMLtags() assigns the
#   result back to its text variable)
681 public static function removeHTMLcomments( $text ) {
682 while ( ( $start = strpos( $text,
'<!--' ) ) !==
false ) {
683 $end = strpos( $text,
'-->', $start + 4 );
684 if ( $end ===
false ) {
685 # Unterminated comment; bail out
691 # Trim space and newline if the comment is both
692 # preceded and followed by a newline
693 $spaceStart = max( $start - 1, 0 );
694 $spaceLen = $end - $spaceStart;
695 while ( substr( $text, $spaceStart, 1 ) ===
' ' && $spaceStart > 0 ) {
699 while ( substr( $text, $spaceStart + $spaceLen, 1 ) ===
' ' ) {
702 if ( substr( $text, $spaceStart, 1 ) ===
"\n"
703 && substr( $text, $spaceStart + $spaceLen, 1 ) ===
"\n" ) {
704 # Remove the comment, leading and trailing
705 # spaces, and leave only one newline.
706 $text = substr_replace( $text,
"\n", $spaceStart, $spaceLen + 1 );
708 # Remove just the comment.
709 $text = substr_replace( $text,
'', $start, $end - $start );
# Element-specific validation that goes beyond attribute filtering.
#
# Only meta and link elements are checked in the visible code: both need
# 'itemprop' to be set, meta additionally needs 'content' and link needs
# 'href'.
# NOTE(review): the result of each failed check is not visible in this view;
# removeHTMLtags() treats a falsy return value as "invalid element".
#
# @param string|array $params Element parameters; indexed by attribute name
#   in the checks below -- TODO confirm the expected type against callers
# @param string $element Lower-case element name
727 static function validateTag(
$params, $element ) {
730 if ( $element ==
'meta' || $element ==
'link' ) {
731 if ( !isset(
$params[
'itemprop'] ) ) {
735 if ( $element ==
'meta' && !isset(
$params[
'content'] ) ) {
739 if ( $element ==
'link' && !isset(
$params[
'href'] ) ) {
# Filters an attribute array down to what is permitted on one element.
#
# Looks up the element's whitelist and hands both to validateAttributes(),
# which does the actual filtering and value sanitising.
#
# @param array $attribs Attribute name => value pairs
# @param string $element Element name used to select the whitelist
# @return array Whatever validateAttributes() returns for that whitelist
static function validateTagAttributes( $attribs, $element ) {
	$allowedForElement = self::attributeWhitelist( $element );

	return self::validateAttributes( $attribs, $allowedForElement );
}
# Filters an attribute array against a whitelist and sanitises the values
# that are kept.
#
# Rules visible in this view:
#  - xmlns:* declarations are tested against EVIL_URI_PATTERN;
#  - any other attribute must either be whitelisted or match data-* without
#    a colon, and must not be a reserved data attribute;
#  - 'style' receives special handling (details not visible here);
#  - 'id' is escaped with escapeIdForAttribute(), and the aria-* id reference
#    lists with escapeIdReferenceList();
#  - RDFa and microdata attributes are tested against EVIL_URI_PATTERN;
#  - href, src and poster are matched against $hrefExp;
#  - itemtype, itemid and itemref are dropped when itemscope is absent.
#
# @param array $attribs Attribute name => value pairs
# @param array $whitelist List of allowed attribute names (flipped into a
#   hashtable before use)
783 static function validateAttributes(
$attribs, $whitelist ) {
784 $whitelist = array_flip( $whitelist );
789 # Allow XML namespace declaration to allow RDFa
790 if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
791 if ( !preg_match( self::EVIL_URI_PATTERN,
$value ) ) {
798 # Allow any attribute beginning with "data-"
800 # * Disallow data attributes used by MediaWiki code
801 # * Ensure that the attribute is not namespaced by banning
803 if ( !preg_match(
'/^data-[^:]*$/i', $attribute )
804 && !isset( $whitelist[$attribute] )
805 || self::isReservedDataAttribute( $attribute )
810 # Strip javascript "expression" from stylesheets.
811 # https://msdn.microsoft.com/en-us/library/ms537634.aspx
812 if ( $attribute ==
'style' ) {
816 # Escape HTML id attributes
817 if ( $attribute ===
'id' ) {
818 $value = self::escapeIdForAttribute(
$value, self::ID_PRIMARY );
821 # Escape HTML id reference lists
822 if ( $attribute ===
'aria-describedby'
823 || $attribute ===
'aria-flowto'
824 || $attribute ===
'aria-labelledby'
825 || $attribute ===
'aria-owns'
827 $value = self::escapeIdReferenceList(
$value,
'noninitial' );
832 if ( $attribute ===
'rel' || $attribute ===
'rev'
834 || $attribute ===
'about' || $attribute ===
'property'
835 || $attribute ===
'resource' || $attribute ===
'datatype'
836 || $attribute ===
'typeof'
838 || $attribute ===
'itemid' || $attribute ===
'itemprop'
839 || $attribute ===
'itemref' || $attribute ===
'itemscope'
840 || $attribute ===
'itemtype'
843 if ( preg_match( self::EVIL_URI_PATTERN,
$value ) ) {
848 # NOTE: even though elements using href/src are not allowed directly, supply
849 # validation code that can be used by tag hook handlers, etc
850 if ( $attribute ===
'href' || $attribute ===
'src' || $attribute ===
'poster' ) {
851 if ( !preg_match( $hrefExp,
$value ) ) {
862 # itemtype, itemid, itemref don't make sense without itemscope
863 if ( !array_key_exists(
'itemscope',
$out ) ) {
864 unset(
$out[
'itemtype'] );
865 unset(
$out[
'itemid'] );
866 unset(
$out[
'itemref'] );
868 # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
# Tells whether an attribute name is one of the reserved data attributes,
# i.e. begins with data-ooui, data-mw or data-parsoid (case-insensitive).
#
# @param string $attr Attribute name
# @return bool
880 public static function isReservedDataAttribute( $attr ) {
888 return (
bool)preg_match(
'/^data-(ooui|mw|parsoid)/i', $attr );
# Merges two attribute arrays, with entries of $b overriding those of $a.
#
# 'class' is the exception: when both arrays carry differing string values,
# the two values are joined, split on whitespace, de-duplicated and stored as
# one space-separated list instead of letting $b replace $a.
#
# @param array $a
# @param array $b
901 static function mergeAttributes( $a, $b ) {
902 $out = array_merge( $a, $b );
903 if ( isset( $a[
'class'] ) && isset( $b[
'class'] )
904 && is_string( $a[
'class'] ) && is_string( $b[
'class'] )
905 && $a[
'class'] !== $b[
'class']
907 $classes = preg_split(
'/\s+/',
"{$a['class']} {$b['class']}",
908 -1, PREG_SPLIT_NO_EMPTY );
909 $out[
'class'] = implode(
' ', array_unique( $classes ) );
# Normalises a CSS value before it is checked.
#
# Steps visible in this view:
#  - a lazily built $decodeRegex finds backslash escapes (line continuation,
#    hexadecimal character number, escaped character, trailing backslash) and
#    cssDecodeCallback() rewrites each one;
#  - a second callback turns a code point $cp into chr( $cp - 65248 ),
#    presumably folding fullwidth forms onto ASCII -- TODO confirm;
#  - a fixed list of look-alike letters and brackets is replaced by the ASCII
#    characters r, n, n, l, i, ( and (;
#  - the value is tested for being nothing but a single comment, and the
#    position of a comment opener is looked up.
# NOTE(review): what happens after each test, and the purpose of the list of
# UTF-8 byte sequences at the end, are not visible in this view.
#
# @param string $value
923 public static function normalizeCss(
$value ) {
937 if ( !$decodeRegex ) {
938 $space =
'[\\x20\\t\\r\\n\\f]';
939 $nl =
'(?:\\n|\\r\\n|\\r|\\f)';
941 $decodeRegex =
"/ $backslash
943 ($nl) | # 1. Line continuation
944 ([0-9A-Fa-f]{1,6})$space? | # 2. character number
945 (.) | # 3. backslash cancelling special meaning
946 () | # 4. backslash at end of string
949 $value = preg_replace_callback( $decodeRegex,
950 [ __CLASS__,
'cssDecodeCallback' ],
$value );
953 $value = preg_replace_callback(
957 if ( $cp ===
false ) {
960 return chr( $cp - 65248 );
968 [
'ʀ',
'ɴ',
'ⁿ',
'ʟ',
'ɪ',
'⁽',
'₍' ],
969 [
'r',
'n',
'n',
'l',
'i',
'(',
'(' ],
976 if ( !preg_match(
'! ^ \s* /\* [^*\\/]* \*/ \s* $ !x',
$value ) ) {
987 $commentPos = strpos(
$value,
'/*' );
988 if ( $commentPos !==
false ) {
997 \xE3\x80\xB1 | # U+3031
998 \xE3\x82\x9D | # U+309D
999 \xE3\x83\xBC | # U+30FC
1000 \xE3\x83\xBD | # U+30FD
1001 \xEF\xB9\xBC | # U+FE7C
1002 \xEF\xB9\xBD | # U+FE7D
1003 \xEF\xBD\xB0 # U+FF70
# Rejects CSS values that look dangerous.
#
# A value containing one of the listed control characters is replaced by the
# "invalid control char" placeholder comment; a value matching the pattern of
# unsafe constructs (only partly visible here) is replaced by the
# "insecure input" placeholder comment.
# NOTE(review): the return value for an acceptable input is not visible here.
#
# @param string $value
# @return string
1030 static function checkCss(
$value ) {
1034 if ( preg_match(
'/[\000-\010\013\016-\037\177]/',
$value ) ||
1036 return '/* invalid control char */';
1037 } elseif ( preg_match(
1042 | -o-link-source\s*:
1047 | attr\s*\([^)]+[\s,]+url
1049 return '/* insecure input */';
# Callback used by normalizeCss() for each backslash escape it finds.
#
# In the visible branch a decoded newline, double quote, single quote or
# backslash is not emitted literally: it is written back as a backslash, the
# character's byte value in hexadecimal, and a space.
# NOTE(review): how $char is derived from $matches, and the result for any
# other character, are not visible in this view.
#
# @param array $matches Match array from the decode regex
# @return string
1058 static function cssDecodeCallback(
$matches ) {
1069 if ( $char ==
"\n" || $char ==
'"' || $char ==
"'" || $char ==
'\\' ) {
1072 return '\\' . dechex( ord( $char ) ) .
' ';
# Turns a raw attribute string into a sanitised one for the given element:
# the text is parsed with decodeTagAttributes(), filtered with
# validateTagAttributes() and re-encoded with safeEncodeTagAttributes().
# Text that is empty after trimming takes an early branch whose result is not
# visible in this view; the use of $sorted is not visible either.
#
# @param string $text Raw attribute string
# @param string $element Element name
# @param bool $sorted
# @return string
1100 static function fixTagAttributes( $text, $element, $sorted =
false ) {
1101 if ( trim( $text ) ==
'' ) {
1105 $decoded = self::decodeTagAttributes( $text );
1106 $stripped = self::validateTagAttributes( $decoded, $element );
1112 return self::safeEncodeTagAttributes( $stripped );
# Encodes a value for use inside an HTML attribute: htmlspecialchars() with
# ENT_QUOTES first, then a further strtr() replacement table that is not
# visible in this view.
#
# @param string $text
# @return string (safeEncodeAttribute() uses the result as the encoded value)
1120 static function encodeAttribute( $text ) {
1121 $encValue = htmlspecialchars( $text, ENT_QUOTES );
1126 $encValue = strtr( $encValue, [
# Encodes an attribute value with encodeAttribute() and then neutralises
# sequences that later parsing could expand into templates or links: a
# strtr() table rewrites wikitext-significant tokens, and a regex callback
# passes matched text through armorLinksCallback().
# NOTE(review): as written, the visible strtr() pairs for 'ISBN' and 'PMID'
# map each key to itself, and the value paired with the doubled apostrophe is
# malformed -- confirm the intended replacement strings.
#
# @param string $text
# @return string
1141 static function safeEncodeAttribute( $text ) {
1142 $encValue = self::encodeAttribute( $text );
1144 # Templates and links may be expanded in later parsing,
1145 # creating invalid or dangerous output. Suppress this.
1146 $encValue = strtr( $encValue, [
1153 "''" =>
'''',
1154 'ISBN' =>
'ISBN',
1156 'PMID' =>
'PMID',
1162 $encValue = preg_replace_callback(
1164 [
'Sanitizer',
'armorLinksCallback' ],
# Escapes a string for use as an element id.
#
# Two treatments are visible: one collapses runs of whitespace, underscore,
# quote, ampersand, hash and percent characters into a single '_' and trims
# underscores from both ends; the other turns spaces into underscores,
# URL-encodes the result and applies the $replace table. Finally, an id that
# does not begin with an ASCII letter is handled specially unless the
# 'noninitial' option is present.
# NOTE(review): the condition choosing between the treatments, the $replace
# table and the non-letter handling are not visible in this view.
#
# @param string $id
# @param string|array $options The 'noninitial' flag is looked up here with
#   in_array()
1202 static function escapeId( $id,
$options = [] ) {
1207 $id = preg_replace(
'/[ \t\n\r\f_\'"&#%]+/',
'_', $id );
1208 $id = trim( $id,
'_' );
1223 $id = urlencode( strtr( $id,
' ',
'_' ) );
1224 $id = strtr( $id, $replace );
1226 if ( !preg_match(
'/^[a-zA-Z]/', $id ) && !in_array(
'noninitial',
$options ) ) {
# Escapes an id for use in an HTML id attribute, using the escaping mode
# selected by $mode, and returns the result of escapeIdInternal().
#
# @param string $id
# @param int $mode One of the ID_* constants; defaults to ID_PRIMARY
# @return string
# @throws UnexpectedValueException when $mode is ID_PRIMARY and no primary
#   mode is configured (the enclosing check is not visible in this view)
1248 public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
1252 if ( $mode === self::ID_PRIMARY ) {
1253 throw new UnexpectedValueException(
'$wgFragmentMode is configured with no primary mode' );
1260 return self::escapeIdInternal( $id, $internalMode );
# Escapes an id for use as a link fragment by passing it through
# escapeIdInternal() with the selected mode.
#
# @param string $id
# @throws UnexpectedValueException when no primary mode is configured (the
#   guarding condition is not visible in this view)
1275 public static function escapeIdForLink( $id ) {
1279 throw new UnexpectedValueException(
'$wgFragmentMode is configured with no primary mode' );
1284 $id = self::escapeIdInternal( $id, $mode );
# NOTE(review): only the signature is visible in this view. Judging by the
# name this escapes an id for a fragment on an external interwiki target --
# confirm against the full method body.
#
# @param string $id
1298 public static function escapeIdForExternalInterwiki( $id ) {
# Applies the id escaping belonging to one named mode.
#
# Visible treatments: replacing spaces with underscores; URL-encoding the
# underscored id and applying the $replace table; and, for 'html5-legacy',
# collapsing runs of whitespace, underscore, quote, ampersand, hash and
# percent characters into '_' and trimming underscores from both ends.
# NOTE(review): the other case labels and the $replace table are not visible
# in this view.
#
# @param string $id
# @param string $mode Escaping mode name
# @throws InvalidArgumentException for an unrecognised $mode
1313 private static function escapeIdInternal( $id, $mode ) {
1316 $id = str_replace(
' ',
'_', $id );
1325 $id = urlencode( str_replace(
' ',
'_', $id ) );
1326 $id = strtr( $id, $replace );
1328 case 'html5-legacy':
1329 $id = preg_replace(
'/[ \t\n\r\f_\'"&#%]+/',
'_', $id );
1330 $id = trim( $id,
'_' );
1337 throw new InvalidArgumentException(
"Invalid mode '$mode' passed to '" . __METHOD__ );
# Escapes a whitespace-separated list of id references, such as the value of
# aria-labelledby.
#
# The string is split on runs of whitespace (empty tokens dropped), every
# token is escaped with escapeIdForAttribute() using its default mode, and
# the tokens are joined again with single spaces. An empty or all-whitespace
# input therefore yields ''.
# $options is accepted but not read in the visible code.
#
# @param string $referenceString Space delimited list of ids
# @param string|array $options
# @return string
1355 static function escapeIdReferenceList( $referenceString,
$options = [] ) {
1356 # Explode the space delimited list string into an array of tokens
1357 $references = preg_split(
'/\s+/',
"{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );
1359 # Escape each token as an id
1360 foreach ( $references
as &$ref ) {
1361 $ref = self::escapeIdForAttribute( $ref );
1364 # Merge the array back to a space delimited list string
1365 # If the array is empty, the result will be an empty string ('')
1366 $referenceString = implode(
' ', $references );
1368 return $referenceString;
# Escapes a string for use as a CSS class name.
#
# Two patterns are applied in one preg_replace() call: the first matches a
# leading digit or hyphen, any control character, space or ASCII punctuation
# from the listed set, and the non-breaking space; the second matches runs of
# underscores. The result is right-trimmed.
# NOTE(review): the replacement strings and the trim character list are not
# visible in this view.
#
# @param string $class
# @return string
1382 static function escapeClass( $class ) {
1384 return rtrim( preg_replace(
1385 [
'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
'/_+/' ],
# Escapes HTML special characters in $html, including both quote styles.
# ENT_SUBSTITUTE is passed so that invalid byte sequences are substituted
# rather than causing an empty result.
# NOTE(review): the name suggests existing entities are preserved; the code
# doing that is not visible in this view.
#
# @param string $html
1397 static function escapeHtmlAllowEntities(
$html ) {
1399 # It seems wise to escape ' as well as ", as a matter of course. Can't
1400 # hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
1401 # don't cause the entire string to disappear.
1402 $html = htmlspecialchars(
$html, ENT_QUOTES | ENT_SUBSTITUTE );
# Callback for the preg_replace_callback() call in safeEncodeAttribute().
#
# Rewrites every colon in the first capture group as its numeric character
# reference, so the encoded attribute value no longer contains a literal
# "protocol:" sequence that later parsing could turn into a link.
#
# @param array $matches Regex match array; index 1 holds the text to armor
# @return string The captured text with each ':' replaced by '&#58;'
private static function armorLinksCallback( $matches ) {
	# Replacing ':' with itself would leave the text untouched, so emit the
	# character reference instead.
	return str_replace( ':', '&#58;', $matches[1] );
}
# Parses a raw attribute string into name/value pairs.
#
# All matches of getAttribsRegex() are collected in set order. For each one
# the attribute name is lower-cased and the value is picked out of the
# capture groups by getTagAttributeCallback().
# NOTE(review): the results for blank input and for "no matches", and any
# further normalisation of the value, are not visible in this view.
#
# @param string $text Raw attribute string
1423 public static function decodeTagAttributes( $text ) {
1424 if ( trim( $text ) ==
'' ) {
1430 if ( !preg_match_all(
1431 self::getAttribsRegex(),
1434 PREG_SET_ORDER ) ) {
1438 foreach ( $pairs
as $set ) {
1439 $attribute = strtolower( $set[1] );
1440 $value = self::getTagAttributeCallback( $set );
# Serialises an attribute array back into HTML attribute syntax.
#
# Each name is escaped with htmlspecialchars(), each value with
# safeEncodeAttribute(), and the pair is formatted as name="value" and
# collected in $attribs.
# NOTE(review): how the collected pairs are joined and returned is not
# visible in this view.
#
# @param array $assoc_array Attribute name => value pairs
1459 public static function safeEncodeTagAttributes( $assoc_array ) {
1461 foreach ( $assoc_array
as $attribute =>
$value ) {
1462 $encAttribute = htmlspecialchars( $attribute );
1463 $encValue = self::safeEncodeAttribute(
$value );
1465 $attribs[] =
"$encAttribute=\"$encValue\"";
# Picks the attribute value out of one match set produced by
# getAttribsRegex().
#
# Capture groups 5, 4 and 3 are tried in that order; when group 2 is not set
# either, the attribute had no value and, per the comment below, an empty
# string is used.
# NOTE(review): the value returned by each branch is not visible in this view.
#
# @param array $set One match set from preg_match_all()
# @throws MWException when none of the expected cases applies
1478 private static function getTagAttributeCallback( $set ) {
1479 if ( isset( $set[5] ) ) {
1482 } elseif ( isset( $set[4] ) ) {
1485 } elseif ( isset( $set[3] ) ) {
1488 } elseif ( !isset( $set[2] ) ) {
1489 # In XHTML, attributes must have a value so return an empty string.
1490 # See "Empty attribute syntax",
1491 # https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
1494 throw new MWException(
"Tag conditions not met. This should never happen and is a bug." );
# Normalises whitespace in $text: a CRLF pair, or any single space, carriage
# return, line feed or tab, is matched and replaced.
# NOTE(review): the replacement string is not visible in this view.
#
# @param string $text
# @return string
1502 private static function normalizeWhitespace( $text ) {
1503 return preg_replace(
1504 '/\r\n|[\x20\x0d\x0a\x09]/',
# Normalises whitespace in a section name.
#
# Every run of spaces and/or underscores becomes a single space, and the
# result is trimmed at both ends.
#
# @param string $section
# @return string
static function normalizeSectionNameWhitespace( $section ) {
	$collapsed = preg_replace( '/[ _]+/', ' ', $section );

	return trim( $collapsed );
}
# Normalises the character references in $text by running every match of
# CHAR_REFS_REGEX through normalizeCharReferencesCallback().
#
# @param string $text
# @return string
1536 static function normalizeCharReferences( $text ) {
1537 return preg_replace_callback(
1538 self::CHAR_REFS_REGEX,
1539 [
'Sanitizer',
'normalizeCharReferencesCallback' ],
# Callback for normalizeCharReferences().
#
# When no normalised form was produced for the match ($ret is null), the
# matched text is returned HTML-escaped, so an unrecognised reference ends up
# as literal text.
# NOTE(review): how $ret is computed from $matches, and the return for a
# non-null $ret, are not visible in this view.
#
# @param array $matches Match array from CHAR_REFS_REGEX
# @return string
1547 static function normalizeCharReferencesCallback(
$matches ) {
1556 if ( is_null(
$ret ) ) {
1557 return htmlspecialchars(
$matches[0] );
# Normalises a named entity reference.
#
# An alias is rewritten to the entity it stands for; lt, gt, amp and quot
# form their own case (result not visible in this view); any other known
# entity becomes a decimal numeric reference built from the $htmlEntities
# table; an unknown name takes the final fallback.
# NOTE(review): as written the final fallback returns the reference
# unchanged -- confirm whether its ampersand was meant to be escaped.
#
# @param string $name Entity name without the ampersand and semicolon
# @return string
1573 static function normalizeEntity(
$name ) {
1574 if ( isset( self::$htmlEntityAliases[
$name] ) ) {
1575 return '&' . self::$htmlEntityAliases[
$name] .
';';
1576 } elseif ( in_array(
$name, [
'lt',
'gt',
'amp',
'quot' ] ) ) {
1578 } elseif ( isset( self::$htmlEntities[
$name] ) ) {
1579 return '&#' . self::$htmlEntities[
$name] .
';';
1581 return "&$name;";
# Normalises a decimal numeric character reference.
#
# The digits are converted with intval(); when validateCodepoint() accepts
# the code point it is re-emitted as a decimal reference.
# NOTE(review): the result for a rejected code point is not visible here.
#
# @param string $codepoint Decimal digits
# @return string|null
1589 static function decCharReference( $codepoint ) {
1590 $point = intval( $codepoint );
1591 if ( self::validateCodepoint( $point ) ) {
1592 return sprintf(
'&#%d;', $point );
# Normalises a hexadecimal numeric character reference.
#
# The digits are converted with hexdec(); when validateCodepoint() accepts
# the code point it is re-emitted as a lower-case hexadecimal reference.
# NOTE(review): the result for a rejected code point is not visible here.
#
# @param string $codepoint Hexadecimal digits
# @return string|null
1602 static function hexCharReference( $codepoint ) {
1603 $point = hexdec( $codepoint );
1604 if ( self::validateCodepoint( $point ) ) {
1605 return sprintf(
'&#x%x;', $point );
# Decides whether a numeric code point may be used in a character reference.
#
# Accepted: tab, line feed, U+0020 to U+007E, U+00A0 to U+D7FF, U+E000 to
# U+FFFD and U+10000 to U+10FFFF. Everything else is rejected.
#
# @param int $codepoint
# @return bool
private static function validateCodepoint( $codepoint ) {
	# Tab and line feed are the only control characters let through.
	if ( $codepoint == 0x09 || $codepoint == 0x0a ) {
		return true;
	}

	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	# U+007F - U+009F are disallowed in HTML5 (control characters).
	$allowedRanges = [
		[ 0x20, 0x7e ],
		[ 0xa0, 0xd7ff ],
		[ 0xe000, 0xfffd ],
		[ 0x10000, 0x10ffff ],
	];
	foreach ( $allowedRanges as $range ) {
		if ( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
			return true;
		}
	}

	return false;
}
# Decodes the character references in $text by running every match of
# CHAR_REFS_REGEX through decodeCharReferencesCallback().
#
# @param string $text
# @return string
1636 public static function decodeCharReferences( $text ) {
1637 return preg_replace_callback(
1638 self::CHAR_REFS_REGEX,
1639 [
'Sanitizer',
'decodeCharReferencesCallback' ],
# Decodes the character references in $text with the same callback as
# decodeCharReferences().
# NOTE(review): the normalisation step implied by the name, and the return
# statement, are not visible in this view.
#
# @param string $text
1653 public static function decodeCharReferencesAndNormalize( $text ) {
1655 $text = preg_replace_callback(
1656 self::CHAR_REFS_REGEX,
1657 [
'Sanitizer',
'decodeCharReferencesCallback' ],
# Callback that decodes one CHAR_REFS_REGEX match.
#
# Group 1 (a named entity) goes to decodeEntity(); group 2 (decimal digits)
# is converted with intval() and group 3 (hexadecimal digits) with hexdec(),
# both then going to decodeChar(). The remaining case is a bare ampersand.
# NOTE(review): the conditions selecting each branch and the result for the
# bare ampersand are not visible in this view.
#
# @param array $matches Match array from CHAR_REFS_REGEX
# @return string
1674 static function decodeCharReferencesCallback(
$matches ) {
1676 return self::decodeEntity(
$matches[1] );
1678 return self::decodeChar( intval(
$matches[2] ) );
1680 return self::decodeChar( hexdec(
$matches[3] ) );
1682 # Last case should be an ampersand by itself
# Converts a numeric code point into its character, after first checking it
# with validateCodepoint().
# NOTE(review): the values returned for accepted and rejected code points
# are not visible in this view.
#
# @param int $codepoint
1693 static function decodeChar( $codepoint ) {
1694 if ( self::validateCodepoint( $codepoint ) ) {
# Decodes a named entity. The name is first looked up in the alias table and
# then in the $htmlEntities table.
# NOTE(review): what each lookup does with its hit, and the result for an
# unknown name, are not visible in this view.
#
# @param string $name Entity name without the ampersand and semicolon
1709 static function decodeEntity(
$name ) {
1710 if ( isset( self::$htmlEntityAliases[
$name] ) ) {
1713 if ( isset( self::$htmlEntities[
$name] ) ) {
# Looks up the allowed attribute names for one element in the table built by
# setupAttributeWhitelist().
# NOTE(review): the value returned for an element missing from the table is
# not visible in this view.
#
# @param string $element Element name
# @return array
1726 static function attributeWhitelist( $element ) {
1727 $list = self::setupAttributeWhitelist();
1728 return isset( $list[$element] )
# Builds the table mapping each element name to the list of attribute names
# allowed on it.
#
# Most entries are the shared $common list, optionally extended with
# element-specific names through array_merge(); $block, $tablealign and
# $tablecell are further shared lists. 'math', 'meta' and 'link' get
# explicit lists that do not include $common.
# The early test on $whitelist suggests the table is cached after the first
# call -- TODO confirm, since the declaration of $whitelist is not visible
# in this view.
#
# @return array Element name => list of allowed attribute names
1738 static function setupAttributeWhitelist() {
1741 if ( $whitelist !==
null ) {
1763 # These attributes are specified in section 9 of
1764 # https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
1771 # Microdata. These are specified by
1772 # https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
1780 $block = array_merge( $common, [
'align' ] );
1781 $tablealign = [
'align',
'valign' ];
1789 'nowrap', # deprecated
1790 'width', # deprecated
1791 'height', # deprecated
1792 'bgcolor', # deprecated
1795 # Numbers refer to sections in HTML 4.01 standard describing the element.
1796 # See: https://www.w3.org/TR/html4/
1800 'center' => $common, # deprecated
1819 'strong' => $common,
1830 'blockquote' => array_merge( $common, [
'cite' ] ),
1831 'q' => array_merge( $common, [
'cite' ] ),
1841 'br' => array_merge( $common, [
'clear' ] ),
1843 # https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
1847 'pre' => array_merge( $common, [
'width' ] ),
1850 'ins' => array_merge( $common, [
'cite',
'datetime' ] ),
1851 'del' => array_merge( $common, [
'cite',
'datetime' ] ),
1854 'ul' => array_merge( $common, [
'type' ] ),
1855 'ol' => array_merge( $common, [
'type',
'start',
'reversed' ] ),
1856 'li' => array_merge( $common, [
'type',
'value' ] ),
1864 'table' => array_merge( $common,
1865 [
'summary',
'width',
'border',
'frame',
1866 'rules',
'cellspacing',
'cellpadding',
1871 'caption' => $block,
1879 'colgroup' => array_merge( $common, [
'span' ] ),
1880 'col' => array_merge( $common, [
'span' ] ),
1883 'tr' => array_merge( $common, [
'bgcolor' ], $tablealign ),
1886 'td' => array_merge( $common, $tablecell, $tablealign ),
1887 'th' => array_merge( $common, $tablecell, $tablealign ),
1890 # NOTE: <a> is not allowed directly, but the attrib
1891 # whitelist is used from the Parser object
1892 'a' => array_merge( $common, [
'href',
'rel',
'rev' ] ), # rel/rev esp.
for RDFa
1895 # Not usually allowed, but may be used for extension-style hooks
1896 # such as <math> when it is rasterized, or if $wgAllowImageTag is
1898 'img' => array_merge( $common, [
'alt',
'src',
'width',
'height',
'srcset' ] ),
1900 'video' => array_merge( $common, [
'poster',
'controls',
'preload',
'width',
'height' ] ),
1901 'source' => array_merge( $common, [
'type',
'src' ] ),
1902 'track' => array_merge( $common, [
'type',
'src',
'srclang',
'kind',
'label' ] ),
1910 'strike' => $common,
1915 'font' => array_merge( $common, [
'size',
'color',
'face' ] ),
1919 'hr' => array_merge( $common, [
'width' ] ),
1921 # HTML Ruby annotation text module, simple ruby only.
1922 # https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
1927 'rt' => $common, # array_merge( $common,
array(
'rbspan' ) ),
1930 # MathML root element, where used for extensions
1931 # 'title' may not be 100% valid here; it's XHTML
1932 # https://www.w3.org/TR/REC-MathML/
1933 'math' => [
'class',
'style',
'id',
'title' ],
1936 'figure' => $common,
1937 'figcaption' => $common,
1939 # HTML 5 section 4.6
1942 # HTML5 elements, defined by:
1943 # https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
1944 'data' => array_merge( $common, [
'value' ] ),
1945 'time' => array_merge( $common, [
'datetime' ] ),
1953 'meta' => [
'itemprop',
'content' ],
1954 'link' => [
'itemprop',
'href',
'title' ],
# Reduces $text to plain text. The steps visible in this view decode
# character references and normalise whitespace.
# NOTE(review): the tag-stripping step implied by the name, and the return
# statement, are not visible in this view.
#
# @param string $text
1970 static function stripAllTags( $text ) {
1974 # Normalize &entities and whitespace
1975 $text = self::decodeCharReferences( $text );
1976 $text = self::normalizeWhitespace( $text );
# Builds a DOCTYPE declaration with an internal subset that declares every
# entry of the $htmlEntities table as an ENTITY whose replacement text is the
# matching decimal character reference.
# NOTE(review): the closing of the declaration and the return statement are
# not visible in this view.
1990 static function hackDocType() {
1991 $out =
"<!DOCTYPE html [\n";
1992 foreach ( self::$htmlEntities
as $entity => $codepoint ) {
1993 $out .=
"<!ENTITY $entity \"&#$codepoint;\">";
# Cleans up a URL before it is used in a link.
#
# Character references are decoded first. Brackets, angle brackets, double
# quotes, pipes, control characters and spaces are then rewritten by
# cleanUrlCallback(). If the URL splits into protocol, host part and rest,
# whitespace and the listed invisible or ignorable characters (soft hyphens,
# zero-width characters, variation selectors, ...) are stripped from the
# host, a percent-encoded bracketed IPv6 host is recognised, and the URL is
# reassembled as protocol, host and rest.
# NOTE(review): the handling of the IPv6 match and the result for a URL that
# does not split are not visible in this view.
#
# @param string $url
# @return string
2003 static function cleanUrl( $url ) {
2004 # Normalize any HTML entities in input. They will be
2005 # re-escaped by makeExternalLink().
2006 $url = self::decodeCharReferences( $url );
2008 # Escape any control characters introduced by the above step
2009 $url = preg_replace_callback(
'/[\][<>"\\x00-\\x20\\x7F\|]/',
2010 [ __CLASS__,
'cleanUrlCallback' ], $url );
2012 # Validate hostname portion
2014 if ( preg_match(
'!^([^:]+:)(//[^/]+)?(.*)$!iD', $url,
$matches ) ) {
2021 \\s| # general whitespace
2022 \xc2\xad| # 00ad SOFT HYPHEN
2023 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
2024 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
2025 \xe2\x81\xa0| # 2060 WORD JOINER
2026 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
2027 \xcd\x8f| # 034f COMBINING GRAPHEME JOINER
2028 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
2029 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
2030 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
2031 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
2032 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
2033 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
2036 $host = preg_replace( $strip,
'', $host );
2039 if ( substr_compare(
"//%5B", $host, 0, 5 ) === 0 &&
2040 preg_match(
'!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host,
$matches )
2047 return $protocol . $host . $rest;
# Callback used by cleanUrl() for each unsafe character it matches.
# NOTE(review): the body is not visible in this view; presumably it returns
# an escaped form of the matched character -- confirm.
#
# @param array $matches Match array from the cleanUrl() pattern
2057 static function cleanUrlCallback(
$matches ) {
# Tells whether $addr looks like an email address.
#
# The pattern is assembled from two character sets: the RFC 5322 "atext"
# characters plus the dot for the user part, and letters, digits and hyphen
# (RFC 1034 LDH) for the domain labels. A first domain label is required;
# further labels must each be preceded by a dot.
# NOTE(review): the anchors, the separator between the two parts and the
# pattern modifiers are not visible in this view.
#
# @param string $addr
# @return bool
2089 public static function validateEmail( $addr ) {
2098 $rfc5322_atext =
"a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
2099 $rfc1034_ldh_str =
"a-z0-9\\-";
2101 $html5_email_regexp =
"/
2103 [$rfc5322_atext\\.]+ # user part which is liberal :p
2105 [$rfc1034_ldh_str]+ # First domain part
2106 (\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
2110 return (
bool)preg_match( $html5_email_regexp, $addr );