36 const CHAR_REFS_REGEX =
37 '/&([A-Za-z0-9\x80-\xff]+);
39 |&\#[xX]([0-9A-Fa-f]+);
# Splits one chunk of text that followed a '<' into five captures:
#   1. an optional leading '/',
#   2. the element name: an ASCII letter followed by any characters other than
#      tab, newline, vertical tab, space, '/', '>' or NUL (possessive match),
#   3. everything up to the closing bracket, matched lazily,
#   4. the closing '>' or '/>',
#   5. the trailing text, which may not contain '<'.
# The comments in removeHTMLtags() name these pieces $slash, $t, $params,
# $brace and $rest.
46 const ELEMENT_BITS_REGEX =
'!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
# Case-insensitive match for "javascript" or "vbscript" when it appears at the
# start of the string, after whitespace, or after a closing "*/", and is
# followed by a non-word character or the end of the string.
# validateAttributes() runs it against attribute values.
56 const EVIL_URI_PATTERN =
'!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
# Matches a whole attribute name of the form "xmlns:" followed by one or more
# of: colon, ASCII letter, underscore, hyphen, dot or digit.
# validateAttributes() runs it against attribute names.
57 const XMLNS_ATTRIBUTE_PATTERN =
"/^xmlns:[:A-Z_a-z-.0-9]+$/";
# NOTE(review): presumably the counterpart of self::ID_PRIMARY as a $mode value
# for escapeIdForAttribute() — confirm against the ID_PRIMARY declaration.
72 const ID_FALLBACK = 1;
79 private static $htmlEntities = [
338 private static $htmlEntityAliases = [
346 private static $attribsRegex;
355 static function getAttribsRegex() {
356 if ( self::$attribsRegex ===
null ) {
357 $attribFirst =
"[:_\p{L}\p{N}]";
358 $attrib =
"[:_\.\-\p{L}\p{N}]";
359 $space =
'[\x09\x0a\x0c\x0d\x20]';
360 self::$attribsRegex =
361 "/(?:^|$space)({$attribFirst}{$attrib}*)
364 # The attribute value: quoted or alone
369 )?(?=$space|\$)/sxu";
371 return self::$attribsRegex;
380 public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
383 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
384 $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
389 if ( !$staticInitialised || $staticInitialised != $globalContext ) {
390 $htmlpairsStatic = [ # Tags
that must be closed
391 'b',
'bdi',
'del',
'i',
'ins',
'u',
'font',
'big',
'small',
'sub',
'sup',
'h1',
392 'h2',
'h3',
'h4',
'h5',
'h6',
'cite',
'code',
'em',
's',
393 'strike',
'strong',
'tt',
'var',
'div',
'center',
394 'blockquote',
'ol',
'ul',
'dl',
'table',
'caption',
'pre',
395 'ruby',
'rb',
'rp',
'rt',
'rtc',
'p',
'span',
'abbr',
'dfn',
396 'kbd',
'samp',
'data',
'time',
'mark'
399 'br',
'wbr',
'hr',
'li',
'dt',
'dd',
'meta',
'link'
402 # Elements that cannot have close tags. This is (not coincidentally)
403 # also the list of tags for which the HTML 5 parsing algorithm
404 # requires you to "acknowledge the token's self-closing flag", i.e.
405 # a self-closing tag like <br/> is not an HTML 5 parse error only
408 'br',
'wbr',
'hr',
'meta',
'link'
411 $htmlnest = [ # Tags
that can be nested--??
412 'table',
'tr',
'td',
'th',
'div',
'blockquote',
'ol',
'ul',
413 'li',
'dl',
'dt',
'dd',
'font',
'big',
'small',
'sub',
'sup',
'span',
414 'var',
'kbd',
'samp',
'em',
'strong',
'q',
'ruby',
'bdo'
416 $tabletags = [ # Can only appear inside
table, we
will close
them
427 $htmlsingle[] =
'img';
428 $htmlsingleonly[] =
'img';
431 $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
432 $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
434 # Convert them all to hashtables for faster lookup
435 $vars = [
'htmlpairsStatic',
'htmlsingle',
'htmlsingleonly',
'htmlnest',
'tabletags',
436 'htmllist',
'listtags',
'htmlsingleallowed',
'htmlelementsStatic' ];
438 $$var = array_flip( $$var );
440 $staticInitialised = $globalContext;
443 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
444 $extratags = array_flip( $extratags );
445 $removetags = array_flip( $removetags );
446 $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
447 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
450 'htmlpairs' => $htmlpairs,
451 'htmlsingle' => $htmlsingle,
452 'htmlsingleonly' => $htmlsingleonly,
453 'htmlnest' => $htmlnest,
454 'tabletags' => $tabletags,
455 'htmllist' => $htmllist,
456 'listtags' => $listtags,
457 'htmlsingleallowed' => $htmlsingleallowed,
458 'htmlelements' => $htmlelements,
477 public static function removeHTMLtags( $text, $processCallback =
null,
478 $args = [], $extratags = [], $removetags = [], $warnCallback =
null
480 $tagData = self::getRecognizedTagData( $extratags, $removetags );
481 $htmlpairs = $tagData[
'htmlpairs'];
482 $htmlsingle = $tagData[
'htmlsingle'];
483 $htmlsingleonly = $tagData[
'htmlsingleonly'];
484 $htmlnest = $tagData[
'htmlnest'];
485 $tabletags = $tagData[
'tabletags'];
486 $htmllist = $tagData[
'htmllist'];
487 $listtags = $tagData[
'listtags'];
488 $htmlsingleallowed = $tagData[
'htmlsingleallowed'];
489 $htmlelements = $tagData[
'htmlelements'];
491 # Remove HTML comments
492 $text = self::removeHTMLcomments( $text );
493 $bits = explode(
'<', $text );
494 $text = str_replace(
'>',
'>', array_shift( $bits ) );
496 $tagstack = $tablestack = [];
497 foreach ( $bits
as $x ) {
499 # $slash: Does the current element start with a '/'?
500 # $t: Current element name
501 # $params: String between element name and >
502 # $brace: Ending '>' or '/>'
503 # $rest: Everything until the next element of $bits
504 if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
507 $slash =
$t =
$params = $brace = $rest =
null;
511 $t = strtolower(
$t );
512 if ( isset( $htmlelements[
$t] ) ) {
514 if ( $slash && isset( $htmlsingleonly[
$t] ) ) {
516 } elseif ( $slash ) {
517 # Closing a tag... is it the one we just opened?
518 Wikimedia\suppressWarnings();
519 $ot = array_pop( $tagstack );
520 Wikimedia\restoreWarnings();
523 if ( isset( $htmlsingleallowed[$ot] ) ) {
524 # Pop all elements with an optional close tag
525 # and see if we find a match below them
527 array_push( $optstack, $ot );
528 Wikimedia\suppressWarnings();
529 $ot = array_pop( $tagstack );
530 Wikimedia\restoreWarnings();
531 while ( $ot !=
$t && isset( $htmlsingleallowed[$ot] ) ) {
532 array_push( $optstack, $ot );
533 Wikimedia\suppressWarnings();
534 $ot = array_pop( $tagstack );
535 Wikimedia\restoreWarnings();
538 # No match. Push the optional elements back again
540 Wikimedia\suppressWarnings();
541 $ot = array_pop( $optstack );
542 Wikimedia\restoreWarnings();
544 array_push( $tagstack, $ot );
545 Wikimedia\suppressWarnings();
546 $ot = array_pop( $optstack );
547 Wikimedia\restoreWarnings();
551 Wikimedia\suppressWarnings();
552 array_push( $tagstack, $ot );
553 Wikimedia\restoreWarnings();
555 # <li> can be nested in <ul> or <ol>, skip those cases:
556 if ( !isset( $htmllist[$ot] ) || !isset( $listtags[
$t] ) ) {
561 if (
$t ==
'table' ) {
562 $tagstack = array_pop( $tablestack );
567 # Keep track for later
568 if ( isset( $tabletags[
$t] ) && !in_array(
'table', $tagstack ) ) {
570 } elseif ( in_array(
$t, $tagstack ) && !isset( $htmlnest[
$t] ) ) {
572 # Is it a self closed htmlpair ? (T7487)
573 } elseif ( $brace ==
'/>' && isset( $htmlpairs[
$t] ) ) {
579 if ( is_callable( $warnCallback ) ) {
580 call_user_func_array( $warnCallback, [
'deprecated-self-close-category' ] );
583 } elseif ( isset( $htmlsingleonly[
$t] ) ) {
584 # Hack to force empty tag for unclosable elements
586 } elseif ( isset( $htmlsingle[
$t] ) ) {
587 # Hack to not close $htmlsingle tags
589 # Still need to push this optionally-closed tag to
590 # the tag stack so that we can match end tags
591 # instead of marking them as bad.
592 array_push( $tagstack,
$t );
593 } elseif ( isset( $tabletags[
$t] ) && in_array(
$t, $tagstack ) ) {
597 if (
$t ==
'table' ) {
598 array_push( $tablestack, $tagstack );
601 array_push( $tagstack,
$t );
604 # Replace any variables or template parameters with
606 if ( is_callable( $processCallback ) ) {
607 call_user_func_array( $processCallback, [ &
$params,
$args ] );
610 if ( !self::validateTag(
$params,
$t ) ) {
614 # Strip non-approved attributes from the tag
615 $newparams = self::fixTagAttributes(
$params,
$t );
618 $rest = str_replace(
'>',
'>', $rest );
619 $close = ( $brace ==
'/>' && !$slash ) ?
' /' :
'';
620 $text .=
"<$slash$t$newparams$close>$rest";
624 $text .=
'<' . str_replace(
'>',
'>', $x );
626 # Close off any remaining tags
627 while ( is_array( $tagstack ) && (
$t = array_pop( $tagstack ) ) ) {
629 if (
$t ==
'table' ) {
630 $tagstack = array_pop( $tablestack );
634 # this might be possible using tidy itself
635 foreach ( $bits
as $x ) {
636 if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
640 $t = strtolower(
$t );
641 if ( isset( $htmlelements[
$t] ) ) {
642 if ( is_callable( $processCallback ) ) {
643 call_user_func_array( $processCallback, [ &
$params,
$args ] );
646 if ( $brace ==
'/>' && !( isset( $htmlsingle[
$t] ) || isset( $htmlsingleonly[
$t] ) ) ) {
652 if ( is_callable( $warnCallback ) ) {
653 call_user_func_array( $warnCallback, [
'deprecated-self-close-category' ] );
656 if ( !self::validateTag(
$params,
$t ) ) {
660 $newparams = self::fixTagAttributes(
$params,
$t );
662 if ( $brace ===
'/>' && !isset( $htmlsingleonly[
$t] ) ) {
663 # Interpret self-closing tags as empty tags even when
664 # HTML 5 would interpret them as start tags. Such input
665 # is commonly seen on Wikimedia wikis with this intention.
669 $rest = str_replace(
'>',
'>', $rest );
670 $text .=
"<$slash$t$newparams$brace$rest";
675 $text .=
'<' . str_replace(
'>',
'>', $x );
690 public static function removeHTMLcomments( $text ) {
691 while ( ( $start = strpos( $text,
'<!--' ) ) !==
false ) {
692 $end = strpos( $text,
'-->', $start + 4 );
693 if ( $end ===
false ) {
694 # Unterminated comment; bail out
700 # Trim space and newline if the comment is both
701 # preceded and followed by a newline
702 $spaceStart = max( $start - 1, 0 );
703 $spaceLen = $end - $spaceStart;
704 while ( substr( $text, $spaceStart, 1 ) ===
' ' && $spaceStart > 0 ) {
708 while ( substr( $text, $spaceStart + $spaceLen, 1 ) ===
' ' ) {
711 if ( substr( $text, $spaceStart, 1 ) ===
"\n"
712 && substr( $text, $spaceStart + $spaceLen, 1 ) ===
"\n" ) {
713 # Remove the comment, leading and trailing
714 # spaces, and leave only one newline.
715 $text = substr_replace( $text,
"\n", $spaceStart, $spaceLen + 1 );
717 # Remove just the comment.
718 $text = substr_replace( $text,
'', $start, $end - $start );
736 static function validateTag(
$params, $element ) {
739 if ( $element ==
'meta' || $element ==
'link' ) {
740 if ( !isset(
$params[
'itemprop'] ) ) {
744 if ( $element ==
'meta' && !isset(
$params[
'content'] ) ) {
748 if ( $element ==
'link' && !isset(
$params[
'href'] ) ) {
# Validate the attributes of a single element.
# Looks up the element's attribute whitelist with attributeWhitelist() and
# passes it, together with $attribs, to validateAttributes(); the result of
# that call is returned unchanged.
#
# $attribs: the attributes to check. fixTagAttributes() passes the result of
#           decodeTagAttributes() here.
# $element: element name used for the whitelist lookup.
772 static function validateTagAttributes(
$attribs, $element ) {
773 return self::validateAttributes(
$attribs,
774 self::attributeWhitelist( $element ) );
792 static function validateAttributes(
$attribs, $whitelist ) {
793 $whitelist = array_flip( $whitelist );
798 # Allow XML namespace declaration to allow RDFa
799 if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
800 if ( !preg_match( self::EVIL_URI_PATTERN,
$value ) ) {
807 # Allow any attribute beginning with "data-"
809 # * Disallow data attributes used by MediaWiki code
810 # * Ensure that the attribute is not namespaced by banning
812 if ( !preg_match(
'/^data-[^:]*$/i', $attribute )
813 && !isset( $whitelist[$attribute] )
814 || self::isReservedDataAttribute( $attribute )
819 # Strip javascript "expression" from stylesheets.
820 # https://msdn.microsoft.com/en-us/library/ms537634.aspx
821 if ( $attribute ==
'style' ) {
825 # Escape HTML id attributes
826 if ( $attribute ===
'id' ) {
827 $value = self::escapeIdForAttribute(
$value, self::ID_PRIMARY );
830 # Escape HTML id reference lists
831 if ( $attribute ===
'aria-describedby'
832 || $attribute ===
'aria-flowto'
833 || $attribute ===
'aria-labelledby'
834 || $attribute ===
'aria-owns'
841 if ( $attribute ===
'rel' || $attribute ===
'rev'
843 || $attribute ===
'about' || $attribute ===
'property'
844 || $attribute ===
'resource' || $attribute ===
'datatype'
845 || $attribute ===
'typeof'
847 || $attribute ===
'itemid' || $attribute ===
'itemprop'
848 || $attribute ===
'itemref' || $attribute ===
'itemscope'
849 || $attribute ===
'itemtype'
852 if ( preg_match( self::EVIL_URI_PATTERN,
$value ) ) {
857 # NOTE: even though elements using href/src are not allowed directly, supply
858 # validation code that can be used by tag hook handlers, etc
859 if ( $attribute ===
'href' || $attribute ===
'src' || $attribute ===
'poster' ) {
860 if ( !preg_match( $hrefExp,
$value ) ) {
871 # itemtype, itemid, itemref don't make sense without itemscope
872 if ( !array_key_exists(
'itemscope',
$out ) ) {
873 unset(
$out[
'itemtype'] );
874 unset(
$out[
'itemid'] );
875 unset(
$out[
'itemref'] );
877 # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
# Tell whether an attribute name falls in one of the reserved data-* prefixes.
# $attr: attribute name to test.
# The return statement yields true when the name starts with "data-ooui",
# "data-mw" or "data-parsoid", compared case-insensitively. The pattern is
# anchored only at the start, so longer names sharing one of those prefixes
# match as well. preg_match()'s result is cast to bool.
889 public static function isReservedDataAttribute( $attr ) {
897 return (
bool)preg_match(
'/^data-(ooui|mw|parsoid)/i', $attr );
910 static function mergeAttributes( $a, $b ) {
911 $out = array_merge( $a, $b );
912 if ( isset( $a[
'class'] ) && isset( $b[
'class'] )
913 && is_string( $a[
'class'] ) && is_string( $b[
'class'] )
914 && $a[
'class'] !== $b[
'class']
916 $classes = preg_split(
'/\s+/',
"{$a['class']} {$b['class']}",
917 -1, PREG_SPLIT_NO_EMPTY );
918 $out[
'class'] = implode(
' ', array_unique( $classes ) );
932 public static function normalizeCss(
$value ) {
946 if ( !$decodeRegex ) {
947 $space =
'[\\x20\\t\\r\\n\\f]';
948 $nl =
'(?:\\n|\\r\\n|\\r|\\f)';
950 $decodeRegex =
"/ $backslash
952 ($nl) | # 1. Line continuation
953 ([0-9A-Fa-f]{1,6})$space? | # 2. character number
954 (.) | # 3. backslash cancelling special meaning
955 () | # 4. backslash at end of string
958 $value = preg_replace_callback( $decodeRegex,
959 [ __CLASS__,
'cssDecodeCallback' ],
$value );
962 $value = preg_replace_callback(
966 if ( $cp ===
false ) {
969 return chr( $cp - 65248 );
977 [
'ʀ',
'ɴ',
'ⁿ',
'ʟ',
'ɪ',
'⁽',
'₍' ],
978 [
'r',
'n',
'n',
'l',
'i',
'(',
'(' ],
985 if ( !preg_match(
'! ^ \s* /\* [^*\\/]* \*/ \s* $ !x',
$value ) ) {
996 $commentPos = strpos(
$value,
'/*' );
997 if ( $commentPos !==
false ) {
1006 \xE3\x80\xB1 | # U+3031
1007 \xE3\x82\x9D | # U+309D
1008 \xE3\x83\xBC | # U+30FC
1009 \xE3\x83\xBD | # U+30FD
1010 \xEF\xB9\xBC | # U+FE7C
1011 \xEF\xB9\xBD | # U+FE7D
1012 \xEF\xBD\xB0 # U+FF70
1039 static function checkCss(
$value ) {
1043 if ( preg_match(
'/[\000-\010\013\016-\037\177]/',
$value ) ||
1045 return '/* invalid control char */';
1046 } elseif ( preg_match(
1051 | -o-link-source\s*:
1056 | attr\s*\([^)]+[\s,]+url
1058 return '/* insecure input */';
1067 static function cssDecodeCallback(
$matches ) {
1078 if ( $char ==
"\n" || $char ==
'"' || $char ==
"'" || $char ==
'\\' ) {
1081 return '\\' . dechex( ord( $char ) ) .
' ';
1109 static function fixTagAttributes( $text, $element, $sorted =
false ) {
1110 if ( trim( $text ) ==
'' ) {
1114 $decoded = self::decodeTagAttributes( $text );
1115 $stripped = self::validateTagAttributes( $decoded, $element );
1121 return self::safeEncodeTagAttributes( $stripped );
1129 static function encodeAttribute( $text ) {
1130 $encValue = htmlspecialchars( $text, ENT_QUOTES );
1135 $encValue = strtr( $encValue, [
1150 static function safeEncodeAttribute( $text ) {
1151 $encValue = self::encodeAttribute( $text );
1153 # Templates and links may be expanded in later parsing,
1154 # creating invalid or dangerous output. Suppress this.
1155 $encValue = strtr( $encValue, [
1163 "''" =>
'''',
1164 'ISBN' =>
'ISBN',
1166 'PMID' =>
'PMID',
1172 $encValue = preg_replace_callback(
1175 return str_replace(
':',
':',
$matches[1] );
1214 static function escapeId( $id,
$options = [] ) {
1219 $id = preg_replace(
'/[ \t\n\r\f_\'"&#%]+/',
'_', $id );
1220 $id = trim( $id,
'_' );
1235 $id = urlencode( strtr( $id,
' ',
'_' ) );
1236 $id = strtr( $id, $replace );
1238 if ( !preg_match(
'/^[a-zA-Z]/', $id ) && !in_array(
'noninitial',
$options ) ) {
1260 public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
1264 if ( $mode === self::ID_PRIMARY ) {
1265 throw new UnexpectedValueException(
'$wgFragmentMode is configured with no primary mode' );
1272 return self::escapeIdInternal( $id, $internalMode );
1287 public static function escapeIdForLink( $id ) {
1291 throw new UnexpectedValueException(
'$wgFragmentMode is configured with no primary mode' );
1296 $id = self::escapeIdInternal( $id, $mode );
1310 public static function escapeIdForExternalInterwiki( $id ) {
1325 private static function escapeIdInternal( $id, $mode ) {
1328 $id = str_replace(
' ',
'_', $id );
1337 $id = urlencode( str_replace(
' ',
'_', $id ) );
1338 $id = strtr( $id, $replace );
1340 case 'html5-legacy':
1341 $id = preg_replace(
'/[ \t\n\r\f_\'"&#%]+/',
'_', $id );
1342 $id = trim( $id,
'_' );
1349 throw new InvalidArgumentException(
"Invalid mode '$mode' passed to '" . __METHOD__ );
1367 static function escapeIdReferenceList( $referenceString,
$options = [] ) {
1371 # Explode the space delimited list string into an array of tokens
1372 $references = preg_split(
'/\s+/',
"{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );
1374 # Escape each token as an id
1375 foreach ( $references
as &$ref ) {
1376 $ref = self::escapeIdForAttribute( $ref );
1379 # Merge the array back to a space delimited list string
1380 # If the array is empty, the result will be an empty string ('')
1381 $referenceString = implode(
' ', $references );
1383 return $referenceString;
1397 static function escapeClass( $class ) {
1399 return rtrim( preg_replace(
1400 [
'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
'/_+/' ],
1412 static function escapeHtmlAllowEntities(
$html ) {
1414 # It seems wise to escape ' as well as ", as a matter of course. Can't
1415 # hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
1416 # don't cause the entire string to disappear.
1417 $html = htmlspecialchars(
$html, ENT_QUOTES | ENT_SUBSTITUTE );
1429 public static function decodeTagAttributes( $text ) {
1430 if ( trim( $text ) ==
'' ) {
1436 if ( !preg_match_all(
1437 self::getAttribsRegex(),
1440 PREG_SET_ORDER ) ) {
1444 foreach ( $pairs
as $set ) {
1445 $attribute = strtolower( $set[1] );
1446 $value = self::getTagAttributeCallback( $set );
1465 public static function safeEncodeTagAttributes( $assoc_array ) {
1467 foreach ( $assoc_array
as $attribute =>
$value ) {
1468 $encAttribute = htmlspecialchars( $attribute );
1469 $encValue = self::safeEncodeAttribute(
$value );
1471 $attribs[] =
"$encAttribute=\"$encValue\"";
1484 private static function getTagAttributeCallback( $set ) {
1485 if ( isset( $set[5] ) ) {
1488 } elseif ( isset( $set[4] ) ) {
1491 } elseif ( isset( $set[3] ) ) {
1494 } elseif ( !isset( $set[2] ) ) {
1495 # In XHTML, attributes must have a value so return an empty string.
1496 # See "Empty attribute syntax",
1497 # https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
1500 throw new MWException(
"Tag conditions not met. This should never happen and is a bug." );
1508 private static function normalizeWhitespace( $text ) {
1509 return preg_replace(
1510 '/\r\n|[\x20\x0d\x0a\x09]/',
# Normalize the spacing of a section name.
# Every run of spaces and/or underscores is replaced by a single space, and
# the result is then passed through trim() to strip leading and trailing
# whitespace.
# $section: the section name to normalize.
# Returns the normalized string.
1523 static function normalizeSectionNameWhitespace(
$section ) {
1524 return trim( preg_replace(
'/[ _]+/',
' ',
$section ) );
1542 static function normalizeCharReferences( $text ) {
1543 return preg_replace_callback(
1544 self::CHAR_REFS_REGEX,
1545 [
self::class,
'normalizeCharReferencesCallback' ],
1553 static function normalizeCharReferencesCallback(
$matches ) {
1562 if ( is_null(
$ret ) ) {
1563 return htmlspecialchars(
$matches[0] );
1579 static function normalizeEntity(
$name ) {
1580 if ( isset( self::$htmlEntityAliases[
$name] ) ) {
1581 return '&' . self::$htmlEntityAliases[
$name] .
';';
1582 } elseif ( in_array(
$name, [
'lt',
'gt',
'amp',
'quot' ] ) ) {
1584 } elseif ( isset( self::$htmlEntities[
$name] ) ) {
1585 return '&#' . self::$htmlEntities[
$name] .
';';
1587 return "&$name;";
1595 static function decCharReference( $codepoint ) {
1596 $point = intval( $codepoint );
1597 if ( self::validateCodepoint( $point ) ) {
1598 return sprintf(
'&#%d;', $point );
1608 static function hexCharReference( $codepoint ) {
1609 $point = hexdec( $codepoint );
1610 if ( self::validateCodepoint( $point ) ) {
1611 return sprintf(
'&#x%x;', $point );
# Decide whether a numeric code point is acceptable.
# $codepoint: the code point as a number. decCharReference(),
#             hexCharReference() and decodeChar() pass the result of
#             intval()/hexdec() or an already numeric value.
# Returns true for tab (0x09), line feed (0x0a), 0x20-0x7e, 0xa0-0xd7ff,
# 0xe000-0xfffd and 0x10000-0x10ffff; false for everything else.
# The gaps between those ranges therefore reject the remaining C0 controls
# (including 0x0c and 0x0d), 0x7f-0x9f, 0xd800-0xdfff, 0xfffe and 0xffff.
# Comparisons are loose (==, >=, <=), not strict.
1623 private static function validateCodepoint( $codepoint ) {
1624 # U+000C is valid in HTML5 but not allowed in XML.
1625 # U+000D is valid in XML but not allowed in HTML5.
1626 # U+007F - U+009F are disallowed in HTML5 (control characters).
1627 return $codepoint == 0x09
1628 || $codepoint == 0x0a
1629 || ( $codepoint >= 0x20 && $codepoint <= 0x7e )
1630 || ( $codepoint >= 0xa0 && $codepoint <= 0xd7ff )
1631 || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
1632 || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
1642 public static function decodeCharReferences( $text ) {
1643 return preg_replace_callback(
1644 self::CHAR_REFS_REGEX,
1659 public static function decodeCharReferencesAndNormalize( $text ) {
1661 $text = preg_replace_callback(
1662 self::CHAR_REFS_REGEX,
1680 static function decodeCharReferencesCallback(
$matches ) {
1682 return self::decodeEntity(
$matches[1] );
1684 return self::decodeChar( intval(
$matches[2] ) );
1686 return self::decodeChar( hexdec(
$matches[3] ) );
1688 # Last case should be an ampersand by itself
1699 static function decodeChar( $codepoint ) {
1700 if ( self::validateCodepoint( $codepoint ) ) {
1715 static function decodeEntity(
$name ) {
1716 if ( isset( self::$htmlEntityAliases[
$name] ) ) {
1719 if ( isset( self::$htmlEntities[
$name] ) ) {
1732 static function attributeWhitelist( $element ) {
1733 $list = self::setupAttributeWhitelist();
1734 return isset( $list[$element] )
1744 static function setupAttributeWhitelist() {
1747 if ( $whitelist !==
null ) {
1769 # These attributes are specified in section 9 of
1770 # https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
1777 # Microdata. These are specified by
1778 # https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
1786 $block = array_merge( $common, [
'align' ] );
1787 $tablealign = [
'align',
'valign' ];
1795 'nowrap', # deprecated
1796 'width', # deprecated
1797 'height', # deprecated
1798 'bgcolor', # deprecated
1801 # Numbers refer to sections in HTML 4.01 standard describing the element.
1802 # See: https://www.w3.org/TR/html4/
1806 'center' => $common, # deprecated
1825 'strong' => $common,
1836 'blockquote' => array_merge( $common, [
'cite' ] ),
1837 'q' => array_merge( $common, [
'cite' ] ),
1847 'br' => array_merge( $common, [
'clear' ] ),
1849 # https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
1853 'pre' => array_merge( $common, [
'width' ] ),
1856 'ins' => array_merge( $common, [
'cite',
'datetime' ] ),
1857 'del' => array_merge( $common, [
'cite',
'datetime' ] ),
1860 'ul' => array_merge( $common, [
'type' ] ),
1861 'ol' => array_merge( $common, [
'type',
'start',
'reversed' ] ),
1862 'li' => array_merge( $common, [
'type',
'value' ] ),
1870 'table' => array_merge( $common,
1871 [
'summary',
'width',
'border',
'frame',
1872 'rules',
'cellspacing',
'cellpadding',
1877 'caption' => $block,
1885 'colgroup' => array_merge( $common, [
'span' ] ),
1886 'col' => array_merge( $common, [
'span' ] ),
1889 'tr' => array_merge( $common, [
'bgcolor' ], $tablealign ),
1892 'td' => array_merge( $common, $tablecell, $tablealign ),
1893 'th' => array_merge( $common, $tablecell, $tablealign ),
1896 # NOTE: <a> is not allowed directly, but the attrib
1897 # whitelist is used from the Parser object
1898 'a' => array_merge( $common, [
'href',
'rel',
'rev' ] ), # rel/rev esp.
for RDFa
1901 # Not usually allowed, but may be used for extension-style hooks
1902 # such as <math> when it is rasterized, or if $wgAllowImageTag is
1904 'img' => array_merge( $common, [
'alt',
'src',
'width',
'height',
'srcset' ] ),
1906 'video' => array_merge( $common, [
'poster',
'controls',
'preload',
'width',
'height' ] ),
1907 'source' => array_merge( $common, [
'type',
'src' ] ),
1908 'track' => array_merge( $common, [
'type',
'src',
'srclang',
'kind',
'label' ] ),
1916 'strike' => $common,
1921 'font' => array_merge( $common, [
'size',
'color',
'face' ] ),
1925 'hr' => array_merge( $common, [
'width' ] ),
1927 # HTML Ruby annotation text module, simple ruby only.
1928 # https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
1933 'rt' => $common, # array_merge( $common,
array(
'rbspan' ) ),
1936 # MathML root element, where used for extensions
1937 # 'title' may not be 100% valid here; it's XHTML
1938 # https://www.w3.org/TR/REC-MathML/
1939 'math' => [
'class',
'style',
'id',
'title' ],
1942 'figure' => $common,
1943 'figcaption' => $common,
1945 # HTML 5 section 4.6
1948 # HTML5 elements, defined by:
1949 # https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
1950 'data' => array_merge( $common, [
'value' ] ),
1951 'time' => array_merge( $common, [
'datetime' ] ),
1959 'meta' => [
'itemprop',
'content' ],
1960 'link' => [
'itemprop',
'href',
'title' ],
1976 static function stripAllTags(
$html ) {
1979 $tokenizer =
new RemexHtml\Tokenizer\Tokenizer(
$handler,
$html, [
1980 'ignoreErrors' =>
true,
1982 'ignoreNulls' =>
true,
1983 'skipPreprocess' =>
true,
1985 $tokenizer->execute();
1988 $text = self::normalizeWhitespace( $text );
2001 static function hackDocType() {
2002 $out =
"<!DOCTYPE html [\n";
2003 foreach ( self::$htmlEntities
as $entity => $codepoint ) {
2004 $out .=
"<!ENTITY $entity \"&#$codepoint;\">";
2014 static function cleanUrl( $url ) {
2015 # Normalize any HTML entities in input. They will be
2016 # re-escaped by makeExternalLink().
2017 $url = self::decodeCharReferences( $url );
2019 # Escape any control characters introduced by the above step
2020 $url = preg_replace_callback(
'/[\][<>"\\x00-\\x20\\x7F\|]/',
2021 [ __CLASS__,
'cleanUrlCallback' ], $url );
2023 # Validate hostname portion
2025 if ( preg_match(
'!^([^:]+:)(//[^/]+)?(.*)$!iD', $url,
$matches ) ) {
2032 \\s| # general whitespace
2033 \xc2\xad| # 00ad SOFT HYPHEN
2034 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
2035 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
2036 \xe2\x81\xa0| # 2060 WORD JOINER
2037 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
2038 \xcd\x8f| # 034f COMBINING GRAPHEME JOINER
2039 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
2040 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
2041 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
2042 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
2043 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
2044 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
2047 $host = preg_replace( $strip,
'', $host );
2050 if ( substr_compare(
"//%5B", $host, 0, 5 ) === 0 &&
2051 preg_match(
'!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host,
$matches )
2058 return $protocol . $host . $rest;
2068 static function cleanUrlCallback(
$matches ) {
2100 public static function validateEmail( $addr ) {
2109 $rfc5322_atext =
"a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
2110 $rfc1034_ldh_str =
"a-z0-9\\-";
2112 $html5_email_regexp =
"/
2114 [$rfc5322_atext\\.]+ # user part which is liberal :p
2116 [$rfc1034_ldh_str]+ # First domain part
2117 (\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
2121 return (
bool)preg_match( $html5_email_regexp, $addr );