36 const CHAR_REFS_REGEX =
37 '/&([A-Za-z0-9\x80-\xff]+);
39 |&\#[xX]([0-9A-Fa-f]+);
46 const ELEMENT_BITS_REGEX =
'!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
56 const EVIL_URI_PATTERN =
'!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
57 const XMLNS_ATTRIBUTE_PATTERN =
"/^xmlns:[:A-Z_a-z-.0-9]+$/";
72 const ID_FALLBACK = 1;
79 private static $htmlEntities = [
338 private static $htmlEntityAliases = [
346 private static $attribsRegex;
355 static function getAttribsRegex() {
356 if ( self::$attribsRegex ===
null ) {
357 $attribFirst =
"[:_\p{L}\p{N}]";
358 $attrib =
"[:_\.\-\p{L}\p{N}]";
359 $space =
'[\x09\x0a\x0c\x0d\x20]';
360 self::$attribsRegex =
361 "/(?:^|$space)({$attribFirst}{$attrib}*)
364 # The attribute value: quoted or alone
369 )?(?=$space|\$)/sxu";
371 return self::$attribsRegex;
380 public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
383 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
384 $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
389 if ( !$staticInitialised || $staticInitialised != $globalContext ) {
390 $htmlpairsStatic = [ # Tags
that must be closed
391 'b',
'bdi',
'del',
'i',
'ins',
'u',
'font',
'big',
'small',
'sub',
'sup',
'h1',
392 'h2',
'h3',
'h4',
'h5',
'h6',
'cite',
'code',
'em',
's',
393 'strike',
'strong',
'tt',
'var',
'div',
'center',
394 'blockquote',
'ol',
'ul',
'dl',
'table',
'caption',
'pre',
395 'ruby',
'rb',
'rp',
'rt',
'rtc',
'p',
'span',
'abbr',
'dfn',
396 'kbd',
'samp',
'data',
'time',
'mark'
399 'br',
'wbr',
'hr',
'li',
'dt',
'dd',
'meta',
'link'
402 # Elements that cannot have close tags. This is (not coincidentally)
403 # also the list of tags for which the HTML 5 parsing algorithm
404 # requires you to "acknowledge the token's self-closing flag", i.e.
405 # a self-closing tag like <br/> is not an HTML 5 parse error only
408 'br',
'wbr',
'hr',
'meta',
'link'
411 $htmlnest = [ # Tags
that can be nested--??
412 'table',
'tr',
'td',
'th',
'div',
'blockquote',
'ol',
'ul',
413 'li',
'dl',
'dt',
'dd',
'font',
'big',
'small',
'sub',
'sup',
'span',
414 'var',
'kbd',
'samp',
'em',
'strong',
'q',
'ruby',
'bdo'
416 $tabletags = [ # Can only appear inside
table, we
will close
them
427 $htmlsingle[] =
'img';
428 $htmlsingleonly[] =
'img';
431 $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
432 $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
434 # Convert them all to hashtables for faster lookup
435 $vars = [
'htmlpairsStatic',
'htmlsingle',
'htmlsingleonly',
'htmlnest',
'tabletags',
436 'htmllist',
'listtags',
'htmlsingleallowed',
'htmlelementsStatic' ];
438 $$var = array_flip( $$var );
440 $staticInitialised = $globalContext;
443 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
444 $extratags = array_flip( $extratags );
445 $removetags = array_flip( $removetags );
446 $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
447 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
450 'htmlpairs' => $htmlpairs,
451 'htmlsingle' => $htmlsingle,
452 'htmlsingleonly' => $htmlsingleonly,
453 'htmlnest' => $htmlnest,
454 'tabletags' => $tabletags,
455 'htmllist' => $htmllist,
456 'listtags' => $listtags,
457 'htmlsingleallowed' => $htmlsingleallowed,
458 'htmlelements' => $htmlelements,
477 public static function removeHTMLtags( $text, $processCallback =
null,
478 $args = [], $extratags = [], $removetags = [], $warnCallback =
null
480 extract( self::getRecognizedTagData( $extratags, $removetags ) );
482 # Remove HTML comments
483 $text = self::removeHTMLcomments( $text );
484 $bits = explode(
'<', $text );
485 $text = str_replace(
'>',
'>', array_shift( $bits ) );
487 $tagstack = $tablestack = [];
488 foreach ( $bits
as $x ) {
490 # $slash: Does the current element start with a '/'?
491 # $t: Current element name
492 # $params: String between element name and >
493 # $brace: Ending '>' or '/>'
494 # $rest: Everything until the next element of $bits
495 if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
498 $slash =
$t =
$params = $brace = $rest =
null;
502 $t = strtolower(
$t );
503 if ( isset( $htmlelements[
$t] ) ) {
505 if ( $slash && isset( $htmlsingleonly[
$t] ) ) {
507 } elseif ( $slash ) {
508 # Closing a tag... is it the one we just opened?
509 MediaWiki\suppressWarnings();
510 $ot = array_pop( $tagstack );
511 MediaWiki\restoreWarnings();
514 if ( isset( $htmlsingleallowed[$ot] ) ) {
515 # Pop all elements with an optional close tag
516 # and see if we find a match below them
518 array_push( $optstack, $ot );
519 MediaWiki\suppressWarnings();
520 $ot = array_pop( $tagstack );
521 MediaWiki\restoreWarnings();
522 while ( $ot !=
$t && isset( $htmlsingleallowed[$ot] ) ) {
523 array_push( $optstack, $ot );
524 MediaWiki\suppressWarnings();
525 $ot = array_pop( $tagstack );
526 MediaWiki\restoreWarnings();
529 # No match. Push the optional elements back again
531 MediaWiki\suppressWarnings();
532 $ot = array_pop( $optstack );
533 MediaWiki\restoreWarnings();
535 array_push( $tagstack, $ot );
536 MediaWiki\suppressWarnings();
537 $ot = array_pop( $optstack );
538 MediaWiki\restoreWarnings();
542 MediaWiki\suppressWarnings();
543 array_push( $tagstack, $ot );
544 MediaWiki\restoreWarnings();
546 # <li> can be nested in <ul> or <ol>, skip those cases:
547 if ( !isset( $htmllist[$ot] ) || !isset( $listtags[
$t] ) ) {
552 if (
$t ==
'table' ) {
553 $tagstack = array_pop( $tablestack );
558 # Keep track for later
559 if ( isset( $tabletags[
$t] ) && !in_array(
'table', $tagstack ) ) {
561 } elseif ( in_array(
$t, $tagstack ) && !isset( $htmlnest[
$t] ) ) {
563 # Is it a self closed htmlpair ? (T7487)
564 } elseif ( $brace ==
'/>' && isset( $htmlpairs[
$t] ) ) {
570 if ( is_callable( $warnCallback ) ) {
571 call_user_func_array( $warnCallback, [
'deprecated-self-close-category' ] );
574 } elseif ( isset( $htmlsingleonly[
$t] ) ) {
575 # Hack to force empty tag for unclosable elements
577 } elseif ( isset( $htmlsingle[
$t] ) ) {
578 # Hack to not close $htmlsingle tags
580 # Still need to push this optionally-closed tag to
581 # the tag stack so that we can match end tags
582 # instead of marking them as bad.
583 array_push( $tagstack,
$t );
584 } elseif ( isset( $tabletags[
$t] ) && in_array(
$t, $tagstack ) ) {
588 if (
$t ==
'table' ) {
589 array_push( $tablestack, $tagstack );
592 array_push( $tagstack,
$t );
595 # Replace any variables or template parameters with
597 if ( is_callable( $processCallback ) ) {
598 call_user_func_array( $processCallback, [ &
$params,
$args ] );
601 if ( !self::validateTag(
$params,
$t ) ) {
605 # Strip non-approved attributes from the tag
606 $newparams = self::fixTagAttributes(
$params,
$t );
609 $rest = str_replace(
'>',
'>', $rest );
610 $close = ( $brace ==
'/>' && !$slash ) ?
' /' :
'';
611 $text .=
"<$slash$t$newparams$close>$rest";
615 $text .=
'<' . str_replace(
'>',
'>', $x );
617 # Close off any remaining tags
618 while ( is_array( $tagstack ) && (
$t = array_pop( $tagstack ) ) ) {
620 if (
$t ==
'table' ) {
621 $tagstack = array_pop( $tablestack );
625 # this might be possible using tidy itself
626 foreach ( $bits
as $x ) {
627 if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
631 $t = strtolower(
$t );
632 if ( isset( $htmlelements[
$t] ) ) {
633 if ( is_callable( $processCallback ) ) {
634 call_user_func_array( $processCallback, [ &
$params,
$args ] );
637 if ( $brace ==
'/>' && !( isset( $htmlsingle[
$t] ) || isset( $htmlsingleonly[
$t] ) ) ) {
643 if ( is_callable( $warnCallback ) ) {
644 call_user_func_array( $warnCallback, [
'deprecated-self-close-category' ] );
647 if ( !self::validateTag(
$params,
$t ) ) {
651 $newparams = self::fixTagAttributes(
$params,
$t );
653 if ( $brace ===
'/>' && !isset( $htmlsingleonly[
$t] ) ) {
654 # Interpret self-closing tags as empty tags even when
655 # HTML 5 would interpret them as start tags. Such input
656 # is commonly seen on Wikimedia wikis with this intention.
660 $rest = str_replace(
'>',
'>', $rest );
661 $text .=
"<$slash$t$newparams$brace$rest";
666 $text .=
'<' . str_replace(
'>',
'>', $x );
# Strips HTML comments from $text.
# Each pass of the outer loop finds the next '<!--' opener and then looks for
# the matching '-->' terminator after it.
# NOTE(review): several statements of this function (the bail-out for an
# unterminated comment, the loop bodies that adjust $spaceStart/$spaceLen, and
# the final return) are not visible in this excerpt -- confirm against the
# full file before relying on the details below.
681 public static function removeHTMLcomments( $text ) {
682 while ( ( $start = strpos( $text,
'<!--' ) ) !==
false ) {
# Start searching for the terminator just past the 4-character opener.
683 $end = strpos( $text,
'-->', $start + 4 );
684 if ( $end ===
false ) {
685 # Unterminated comment; bail out
691 # Trim space and newline if the comment is both
692 # preceded and followed by a newline
# $spaceStart begins one character before the opener, clamped at 0.
693 $spaceStart = max( $start - 1, 0 );
694 $spaceLen = $end - $spaceStart;
# Examine spaces to the left of the comment (stops at offset 0).
695 while ( substr( $text, $spaceStart, 1 ) ===
' ' && $spaceStart > 0 ) {
# Examine spaces to the right of the comment.
699 while ( substr( $text, $spaceStart + $spaceLen, 1 ) ===
' ' ) {
702 if ( substr( $text, $spaceStart, 1 ) ===
"\n"
703 && substr( $text, $spaceStart + $spaceLen, 1 ) ===
"\n" ) {
704 # Remove the comment, leading and trailing
705 # spaces, and leave only one newline.
706 $text = substr_replace( $text,
"\n", $spaceStart, $spaceLen + 1 );
708 # Remove just the comment.
709 $text = substr_replace( $text,
'', $start, $end - $start );
# Checks the attributes required on <meta> and <link> elements.
# For either element the 'itemprop' key of $params is tested; 'meta' is
# additionally tested for 'content' and 'link' for 'href'.
# NOTE(review): the statements run when a test fails, and the return value,
# are not visible in this excerpt -- confirm against the full file.
# NOTE(review): $params is indexed like an array here although callers pass
# it alongside raw tag text; presumably it is converted on a line not shown.
727 static function validateTag(
$params, $element ) {
730 if ( $element ==
'meta' || $element ==
'link' ) {
731 if ( !isset(
$params[
'itemprop'] ) ) {
735 if ( $element ==
'meta' && !isset(
$params[
'content'] ) ) {
739 if ( $element ==
'link' && !isset(
$params[
'href'] ) ) {
# Validates $attribs for the given $element.
# Looks up the element's whitelist with attributeWhitelist() and returns
# whatever validateAttributes() produces for $attribs against that list.
763 static function validateTagAttributes(
$attribs, $element ) {
764 return self::validateAttributes(
$attribs,
765 self::attributeWhitelist( $element ) );
783 static function validateAttributes(
$attribs, $whitelist ) {
784 $whitelist = array_flip( $whitelist );
789 # Allow XML namespace declaration to allow RDFa
790 if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
791 if ( !preg_match( self::EVIL_URI_PATTERN,
$value ) ) {
798 # Allow any attribute beginning with "data-"
800 # * Disallow data attributes used by MediaWiki code
801 # * Ensure that the attribute is not namespaced by banning
803 if ( !preg_match(
'/^data-[^:]*$/i', $attribute )
804 && !isset( $whitelist[$attribute] )
805 || self::isReservedDataAttribute( $attribute )
810 # Strip javascript "expression" from stylesheets.
811 # https://msdn.microsoft.com/en-us/library/ms537634.aspx
812 if ( $attribute ==
'style' ) {
816 # Escape HTML id attributes
817 if ( $attribute ===
'id' ) {
818 $value = self::escapeIdForAttribute(
$value, self::ID_PRIMARY );
821 # Escape HTML id reference lists
822 if ( $attribute ===
'aria-describedby'
823 || $attribute ===
'aria-flowto'
824 || $attribute ===
'aria-labelledby'
825 || $attribute ===
'aria-owns'
827 $value = self::escapeIdReferenceList(
$value,
'noninitial' );
832 if ( $attribute ===
'rel' || $attribute ===
'rev'
834 || $attribute ===
'about' || $attribute ===
'property'
835 || $attribute ===
'resource' || $attribute ===
'datatype'
836 || $attribute ===
'typeof'
838 || $attribute ===
'itemid' || $attribute ===
'itemprop'
839 || $attribute ===
'itemref' || $attribute ===
'itemscope'
840 || $attribute ===
'itemtype'
843 if ( preg_match( self::EVIL_URI_PATTERN,
$value ) ) {
848 # NOTE: even though elements using href/src are not allowed directly, supply
849 # validation code that can be used by tag hook handlers, etc
850 if ( $attribute ===
'href' || $attribute ===
'src' || $attribute ===
'poster' ) {
851 if ( !preg_match( $hrefExp,
$value ) ) {
862 # itemtype, itemid, itemref don't make sense without itemscope
863 if ( !array_key_exists(
'itemscope',
$out ) ) {
864 unset(
$out[
'itemtype'] );
865 unset(
$out[
'itemid'] );
866 unset(
$out[
'itemref'] );
868 # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
# Tells whether $attr is a reserved data attribute name.
# Returns true when $attr begins with 'data-ooui', 'data-mw' or
# 'data-parsoid' (case-insensitive), false otherwise.
# The pattern is anchored only at the start, so any longer name sharing one
# of these prefixes also counts as reserved.
880 public static function isReservedDataAttribute( $attr ) {
888 return (
bool)preg_match(
'/^data-(ooui|mw|parsoid)/i', $attr );
# Merges two attribute arrays, with special handling for 'class'.
# array_merge() lets values from $b override those from $a. When both arrays
# carry a string 'class' and the two strings differ, the class lists are
# combined instead of overridden.
# NOTE(review): the return statement is not visible in this excerpt;
# presumably $out is returned -- confirm.
901 static function mergeAttributes( $a, $b ) {
902 $out = array_merge( $a, $b );
903 if ( isset( $a[
'class'] ) && isset( $b[
'class'] )
904 && is_string( $a[
'class'] ) && is_string( $b[
'class'] )
905 && $a[
'class'] !== $b[
'class']
# Split both class strings on whitespace, dropping empty tokens, then
# re-join the de-duplicated names with single spaces.
907 $classes = preg_split(
'/\s+/',
"{$a['class']} {$b['class']}",
908 -1, PREG_SPLIT_NO_EMPTY );
909 $out[
'class'] = implode(
' ', array_unique( $classes ) );
923 public static function normalizeCss(
$value ) {
937 if ( !$decodeRegex ) {
938 $space =
'[\\x20\\t\\r\\n\\f]';
939 $nl =
'(?:\\n|\\r\\n|\\r|\\f)';
941 $decodeRegex =
"/ $backslash
943 ($nl) | # 1. Line continuation
944 ([0-9A-Fa-f]{1,6})$space? | # 2. character number
945 (.) | # 3. backslash cancelling special meaning
946 () | # 4. backslash at end of string
949 $value = preg_replace_callback( $decodeRegex,
950 [ __CLASS__,
'cssDecodeCallback' ],
$value );
953 $value = preg_replace_callback(
957 if ( $cp ===
false ) {
960 return chr( $cp - 65248 );
968 [
'ʀ',
'ɴ',
'ⁿ',
'ʟ',
'ɪ',
'⁽',
'₍' ],
969 [
'r',
'n',
'n',
'l',
'i',
'(',
'(' ],
976 if ( !preg_match(
'! ^ \s* /\* [^*\\/]* \*/ \s* $ !x',
$value ) ) {
987 $commentPos = strpos(
$value,
'/*' );
988 if ( $commentPos !==
false ) {
997 \xE3\x80\xB1 | # U+3031
998 \xE3\x82\x9D | # U+309D
999 \xE3\x83\xBC | # U+30FC
1000 \xE3\x83\xBD | # U+30FD
1001 \xEF\xB9\xBC | # U+FE7C
1002 \xEF\xB9\xBD | # U+FE7D
1003 \xEF\xBD\xB0 # U+FF70
1030 static function checkCss(
$value ) {
1034 if ( preg_match(
'/[\000-\010\013\016-\037\177]/',
$value ) ||
1036 return '/* invalid control char */';
1037 } elseif ( preg_match(
1042 | -o-link-source\s*:
1047 | attr\s*\([^)]+[\s,]+url
1050 return '/* insecure input */';
1059 static function cssDecodeCallback(
$matches ) {
1070 if ( $char ==
"\n" || $char ==
'"' || $char ==
"'" || $char ==
'\\' ) {
1073 return '\\' . dechex( ord( $char ) ) .
' ';
# Turns the raw attribute text of a tag into sanitized attribute text.
# Pipeline for non-blank $text: decodeTagAttributes() parses it,
# validateTagAttributes() filters it for $element, and
# safeEncodeTagAttributes() renders the result, which is returned.
# NOTE(review): what is returned for blank $text, and how $sorted is used,
# are on lines not visible in this excerpt -- confirm against the full file.
1101 static function fixTagAttributes( $text, $element, $sorted =
false ) {
1102 if ( trim( $text ) ==
'' ) {
1106 $decoded = self::decodeTagAttributes( $text );
1107 $stripped = self::validateTagAttributes( $decoded, $element );
1113 return self::safeEncodeTagAttributes( $stripped );
1121 static function encodeAttribute( $text ) {
1122 $encValue = htmlspecialchars( $text, ENT_QUOTES );
1127 $encValue = strtr( $encValue, [
# Encodes an attribute value and then neutralizes sequences that later
# parsing stages could expand.
# The value is first passed through encodeAttribute(); a strtr() map and a
# preg_replace_callback() using armorLinksCallback are then applied.
# NOTE(review): most entries of the strtr() map, the callback's regex and the
# return statement are not visible in this excerpt.
# NOTE(review): the visible map entries look damaged. 'ISBN' => 'ISBN' and
# 'PMID' => 'PMID' replace a string with itself, and the value paired with
# "''" is not a valid single string literal. Presumably the replacement
# values were character references that were decoded during extraction --
# confirm against the full file and restore them if so.
1142 static function safeEncodeAttribute( $text ) {
1143 $encValue = self::encodeAttribute( $text );
1145 # Templates and links may be expanded in later parsing,
1146 # creating invalid or dangerous output. Suppress this.
1147 $encValue = strtr( $encValue, [
1154 "''" =>
'''',
1155 'ISBN' =>
'ISBN',
1157 'PMID' =>
'PMID',
1163 $encValue = preg_replace_callback(
1165 [
'Sanitizer',
'armorLinksCallback' ],
1203 static function escapeId( $id,
$options = [] ) {
1208 $id = preg_replace(
'/[ \t\n\r\f_\'"&#%]+/',
'_', $id );
1209 $id = trim( $id,
'_' );
1224 $id = urlencode( strtr( $id,
' ',
'_' ) );
1225 $id = strtr( $id, $replace );
1227 if ( !preg_match(
'/^[a-zA-Z]/', $id ) && !in_array(
'noninitial',
$options ) ) {
1249 public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
1253 if ( $mode === self::ID_PRIMARY ) {
1254 throw new UnexpectedValueException(
'$wgFragmentMode is configured with no primary mode' );
1261 return self::escapeIdInternal( $id, $internalMode );
1276 public static function escapeIdForLink( $id ) {
1280 throw new UnexpectedValueException(
'$wgFragmentMode is configured with no primary mode' );
1285 $id = self::escapeIdInternal( $id, $mode );
1299 public static function escapeIdForExternalInterwiki( $id ) {
1314 private static function escapeIdInternal( $id, $mode ) {
1317 $id = str_replace(
' ',
'_', $id );
1326 $id = urlencode( str_replace(
' ',
'_', $id ) );
1327 $id = strtr( $id, $replace );
1329 case 'html5-legacy':
1330 $id = preg_replace(
'/[ \t\n\r\f_\'"&#%]+/',
'_', $id );
1331 $id = trim( $id,
'_' );
1338 throw new InvalidArgumentException(
"Invalid mode '$mode' passed to '" . __METHOD__ );
# Escapes every id in a whitespace-separated list of id references.
# $referenceString is cast to a string, split on runs of whitespace, each
# token is passed through escapeIdForAttribute() with its default mode, and
# the tokens are joined back together with single spaces.
# Returns the rebuilt list; an input with no tokens yields ''.
# NOTE(review): $options is not referenced on any line visible in this
# excerpt -- confirm whether it is still needed.
1356 static function escapeIdReferenceList( $referenceString,
$options = [] ) {
1357 # Explode the space delimited list string into an array of tokens
1358 $references = preg_split(
'/\s+/',
"{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );
1360 # Escape each token as an id
# $ref is taken by reference so the assignment rewrites the array in place.
1361 foreach ( $references
as &$ref ) {
1362 $ref = self::escapeIdForAttribute( $ref );
1365 # Merge the array back to a space delimited list string
1366 # If the array is empty, the result will be an empty string ('')
1367 $referenceString = implode(
' ', $references );
1369 return $referenceString;
1383 static function escapeClass( $class ) {
1385 return rtrim( preg_replace(
1386 [
'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
'/_+/' ],
1398 static function escapeHtmlAllowEntities(
$html ) {
1400 # It seems wise to escape ' as well as ", as a matter of course. Can't
1401 # hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
1402 # don't cause the entire string to disappear.
1403 $html = htmlspecialchars(
$html, ENT_QUOTES | ENT_SUBSTITUTE );
/**
 * preg_replace_callback() handler that armors a matched link-like string so
 * later parsing stages do not recognize it as a link.
 *
 * Every ':' in the first capture group is replaced by the numeric character
 * reference '&#58;'. A browser renders the reference as a colon, but the
 * literal "scheme:" sequence no longer appears in the attribute text.
 *
 * The search and replacement strings must differ: replacing ':' with ':'
 * returns the input unchanged and provides no protection.
 *
 * @param array $matches Match array from preg_replace_callback(); index 1
 *   holds the text to armor.
 * @return string The captured text with each colon written as '&#58;'.
 */
private static function armorLinksCallback( $matches ) {
	return str_replace( ':', '&#58;', $matches[1] );
}
1424 public static function decodeTagAttributes( $text ) {
1425 if ( trim( $text ) ==
'' ) {
1431 if ( !preg_match_all(
1432 self::getAttribsRegex(),
1435 PREG_SET_ORDER ) ) {
1439 foreach ( $pairs
as $set ) {
1440 $attribute = strtolower( $set[1] );
1441 $value = self::getTagAttributeCallback( $set );
# Renders an attribute array as name="value" pairs.
# For every entry of $assoc_array the name is escaped with
# htmlspecialchars(), the value with safeEncodeAttribute(), and the
# resulting pair is appended to $attribs.
# NOTE(review): the initialization of $attribs and the way the collected
# pairs are joined and returned are not visible in this excerpt -- confirm.
1460 public static function safeEncodeTagAttributes( $assoc_array ) {
1462 foreach ( $assoc_array
as $attribute =>
$value ) {
1463 $encAttribute = htmlspecialchars( $attribute );
1464 $encValue = self::safeEncodeAttribute(
$value );
1466 $attribs[] =
"$encAttribute=\"$encValue\"";
1479 private static function getTagAttributeCallback( $set ) {
1480 if ( isset( $set[5] ) ) {
1483 } elseif ( isset( $set[4] ) ) {
1486 } elseif ( isset( $set[3] ) ) {
1489 } elseif ( !isset( $set[2] ) ) {
1490 # In XHTML, attributes must have a value so return an empty string.
1491 # See "Empty attribute syntax",
1492 # https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
1495 throw new MWException(
"Tag conditions not met. This should never happen and is a bug." );
1503 private static function normalizeWhitespace( $text ) {
1504 return preg_replace(
1505 '/\r\n|[\x20\x0d\x0a\x09]/',
# Normalizes whitespace in a section name.
# Every run of spaces and/or underscores in $section is collapsed to a
# single space, and the result is trimmed at both ends before being returned.
1518 static function normalizeSectionNameWhitespace(
$section ) {
1519 return trim( preg_replace(
'/[ _]+/',
' ',
$section ) );
1537 static function normalizeCharReferences( $text ) {
1538 return preg_replace_callback(
1539 self::CHAR_REFS_REGEX,
1540 [
'Sanitizer',
'normalizeCharReferencesCallback' ],
1548 static function normalizeCharReferencesCallback(
$matches ) {
1557 if ( is_null(
$ret ) ) {
1558 return htmlspecialchars(
$matches[0] );
# Normalizes a named entity, given its name without the '&' and ';'.
# - A name found in $htmlEntityAliases is returned as the aliased entity.
# - 'lt', 'gt', 'amp' and 'quot' have their own branch, whose body is not
#   visible in this excerpt.
# - A name found in $htmlEntities is returned as a decimal numeric reference
#   built from the stored value.
# NOTE(review): the final return is presumably the fallback for unrecognized
# names. As written it re-emits the name as a live entity; presumably the
# leading ampersand was meant to be escaped and was decoded during
# extraction -- confirm against the full file.
1574 static function normalizeEntity(
$name ) {
1575 if ( isset( self::$htmlEntityAliases[
$name] ) ) {
1576 return '&' . self::$htmlEntityAliases[
$name] .
';';
1577 } elseif ( in_array(
$name, [
'lt',
'gt',
'amp',
'quot' ] ) ) {
1579 } elseif ( isset( self::$htmlEntities[
$name] ) ) {
1580 return '&#' . self::$htmlEntities[
$name] .
';';
1582 return "&$name;";
# Normalizes a decimal character reference.
# $codepoint is converted with intval(); when validateCodepoint() accepts the
# number, the canonical '&#N;' form is returned.
# NOTE(review): the value returned for a rejected code point is not visible
# in this excerpt -- confirm against the full file.
1590 static function decCharReference( $codepoint ) {
1591 $point = intval( $codepoint );
1592 if ( self::validateCodepoint( $point ) ) {
1593 return sprintf(
'&#%d;', $point );
# Normalizes a hexadecimal character reference.
# $codepoint is parsed with hexdec(); when validateCodepoint() accepts the
# number, the '&#x...;' form with lowercase hex digits is returned.
# NOTE(review): the value returned for a rejected code point is not visible
# in this excerpt -- confirm against the full file.
1603 static function hexCharReference( $codepoint ) {
1604 $point = hexdec( $codepoint );
1605 if ( self::validateCodepoint( $point ) ) {
1606 return sprintf(
'&#x%x;', $point );
# Tells whether a numeric code point is acceptable as a character reference.
# Returns true for tab (0x09), line feed (0x0a), 0x20-0x7e, 0xa0-0xd7ff,
# 0xe000-0xfffd and 0x10000-0x10ffff; everything else yields false. This
# excludes the remaining C0 controls, 0x7f-0x9f, the surrogate range
# 0xd800-0xdfff, and 0xfffe/0xffff.
1618 private static function validateCodepoint( $codepoint ) {
1619 # U+000C is valid in HTML5 but not allowed in XML.
1620 # U+000D is valid in XML but not allowed in HTML5.
1621 # U+007F - U+009F are disallowed in HTML5 (control characters).
1622 return $codepoint == 0x09
1623 || $codepoint == 0x0a
1624 || ( $codepoint >= 0x20 && $codepoint <= 0x7e )
1625 || ( $codepoint >= 0xa0 && $codepoint <= 0xd7ff )
1626 || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
1627 || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
1637 public static function decodeCharReferences( $text ) {
1638 return preg_replace_callback(
1639 self::CHAR_REFS_REGEX,
1640 [
'Sanitizer',
'decodeCharReferencesCallback' ],
1654 public static function decodeCharReferencesAndNormalize( $text ) {
1656 $text = preg_replace_callback(
1657 self::CHAR_REFS_REGEX,
1658 [
'Sanitizer',
'decodeCharReferencesCallback' ],
1675 static function decodeCharReferencesCallback(
$matches ) {
1677 return self::decodeEntity(
$matches[1] );
1679 return self::decodeChar( intval(
$matches[2] ) );
1681 return self::decodeChar( hexdec(
$matches[3] ) );
1683 # Last case should be an ampersand by itself
1694 static function decodeChar( $codepoint ) {
1695 if ( self::validateCodepoint( $codepoint ) ) {
1710 static function decodeEntity(
$name ) {
1711 if ( isset( self::$htmlEntityAliases[
$name] ) ) {
1714 if ( isset( self::$htmlEntities[
$name] ) ) {
1727 static function attributeWhitelist( $element ) {
1728 $list = self::setupAttributeWhitelist();
1729 return isset( $list[$element] )
1739 static function setupAttributeWhitelist() {
1742 if ( $whitelist !==
null ) {
1764 # These attributes are specified in section 9 of
1765 # https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
1772 # Microdata. These are specified by
1773 # https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
1781 $block = array_merge( $common, [
'align' ] );
1782 $tablealign = [
'align',
'valign' ];
1790 'nowrap', # deprecated
1791 'width', # deprecated
1792 'height', # deprecated
1793 'bgcolor', # deprecated
1796 # Numbers refer to sections in HTML 4.01 standard describing the element.
1797 # See: https://www.w3.org/TR/html4/
1801 'center' => $common, # deprecated
1820 'strong' => $common,
1831 'blockquote' => array_merge( $common, [
'cite' ] ),
1832 'q' => array_merge( $common, [
'cite' ] ),
1842 'br' => array_merge( $common, [
'clear' ] ),
1844 # https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
1848 'pre' => array_merge( $common, [
'width' ] ),
1851 'ins' => array_merge( $common, [
'cite',
'datetime' ] ),
1852 'del' => array_merge( $common, [
'cite',
'datetime' ] ),
1855 'ul' => array_merge( $common, [
'type' ] ),
1856 'ol' => array_merge( $common, [
'type',
'start',
'reversed' ] ),
1857 'li' => array_merge( $common, [
'type',
'value' ] ),
1865 'table' => array_merge( $common,
1866 [
'summary',
'width',
'border',
'frame',
1867 'rules',
'cellspacing',
'cellpadding',
1872 'caption' => $block,
1880 'colgroup' => array_merge( $common, [
'span' ] ),
1881 'col' => array_merge( $common, [
'span' ] ),
1884 'tr' => array_merge( $common, [
'bgcolor' ], $tablealign ),
1887 'td' => array_merge( $common, $tablecell, $tablealign ),
1888 'th' => array_merge( $common, $tablecell, $tablealign ),
1891 # NOTE: <a> is not allowed directly, but the attrib
1892 # whitelist is used from the Parser object
1893 'a' => array_merge( $common, [
'href',
'rel',
'rev' ] ), # rel/rev esp.
for RDFa
1896 # Not usually allowed, but may be used for extension-style hooks
1897 # such as <math> when it is rasterized, or if $wgAllowImageTag is
1899 'img' => array_merge( $common, [
'alt',
'src',
'width',
'height',
'srcset' ] ),
1901 'video' => array_merge( $common, [
'poster',
'controls',
'preload',
'width',
'height' ] ),
1902 'source' => array_merge( $common, [
'type',
'src' ] ),
1903 'track' => array_merge( $common, [
'type',
'src',
'srclang',
'kind',
'label' ] ),
1911 'strike' => $common,
1916 'font' => array_merge( $common, [
'size',
'color',
'face' ] ),
1920 'hr' => array_merge( $common, [
'width' ] ),
1922 # HTML Ruby annotation text module, simple ruby only.
1923 # https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
1928 'rt' => $common, # array_merge( $common,
array(
'rbspan' ) ),
1931 # MathML root element, where used for extensions
1932 # 'title' may not be 100% valid here; it's XHTML
1933 # https://www.w3.org/TR/REC-MathML/
1934 'math' => [
'class',
'style',
'id',
'title' ],
1937 'figure' => $common,
1938 'figcaption' => $common,
1940 # HTML 5 section 4.6
1943 # HTML5 elements, defined by:
1944 # https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
1945 'data' => array_merge( $common, [
'value' ] ),
1946 'time' => array_merge( $common, [
'datetime' ] ),
1954 'meta' => [
'itemprop',
'content' ],
1955 'link' => [
'itemprop',
'href',
'title' ],
1971 static function stripAllTags( $text ) {
1975 # Normalize &entities and whitespace
1976 $text = self::decodeCharReferences( $text );
1977 $text = self::normalizeWhitespace( $text );
1991 static function hackDocType() {
1992 $out =
"<!DOCTYPE html [\n";
1993 foreach ( self::$htmlEntities
as $entity => $codepoint ) {
1994 $out .=
"<!ENTITY $entity \"&#$codepoint;\">";
2004 static function cleanUrl( $url ) {
2005 # Normalize any HTML entities in input. They will be
2006 # re-escaped by makeExternalLink().
2007 $url = self::decodeCharReferences( $url );
2009 # Escape any control characters introduced by the above step
2010 $url = preg_replace_callback(
'/[\][<>"\\x00-\\x20\\x7F\|]/',
2011 [ __CLASS__,
'cleanUrlCallback' ], $url );
2013 # Validate hostname portion
2015 if ( preg_match(
'!^([^:]+:)(//[^/]+)?(.*)$!iD', $url,
$matches ) ) {
2022 \\s| # general whitespace
2023 \xc2\xad| # 00ad SOFT HYPHEN
2024 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
2025 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
2026 \xe2\x81\xa0| # 2060 WORD JOINER
2027 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
2028 \xcd\x8f| # 034f COMBINING GRAPHEME JOINER
2029 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
2030 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
2031 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
2032 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
2033 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
2034 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
2037 $host = preg_replace( $strip,
'', $host );
2040 if ( substr_compare(
"//%5B", $host, 0, 5 ) === 0 &&
2041 preg_match(
'!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host,
$matches )
2048 return $protocol . $host . $rest;
2058 static function cleanUrlCallback(
$matches ) {
2090 public static function validateEmail( $addr ) {
2099 $rfc5322_atext =
"a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
2100 $rfc1034_ldh_str =
"a-z0-9\\-";
2102 $html5_email_regexp =
"/
2104 [$rfc5322_atext\\.]+ # user part which is liberal :p
2106 [$rfc1034_ldh_str]+ # First domain part
2107 (\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
2111 return (
bool)preg_match( $html5_email_regexp, $addr );