37 '/&([A-Za-z0-9\x80-\xff]+);
39 |&\#[xX]([0-9A-Fa-f]+);
356 if ( self::$attribsRegex ===
null ) {
357 $attribFirst =
"[:_\p{L}\p{N}]";
358 $attrib =
"[:_\.\-\p{L}\p{N}]";
359 $space =
'[\x09\x0a\x0c\x0d\x20]';
360 self::$attribsRegex =
361 "/(?:^|$space)({$attribFirst}{$attrib}*)
364 # The attribute value: quoted or alone
369 )?(?=$space|\$)/sxu";
383 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
384 $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
389 if ( !$staticInitialised || $staticInitialised != $globalContext ) {
390 $htmlpairsStatic = [ # Tags
that must be closed
391 'b',
'bdi',
'del',
'i',
'ins',
'u',
'font',
'big',
'small',
'sub',
'sup',
'h1',
392 'h2',
'h3',
'h4',
'h5',
'h6',
'cite',
'code',
'em',
's',
393 'strike',
'strong',
'tt',
'var',
'div',
'center',
394 'blockquote',
'ol',
'ul',
'dl',
'table',
'caption',
'pre',
395 'ruby',
'rb',
'rp',
'rt',
'rtc',
'p',
'span',
'abbr',
'dfn',
396 'kbd',
'samp',
'data',
'time',
'mark'
399 'br',
'wbr',
'hr',
'li',
'dt',
'dd',
'meta',
'link'
402 # Elements that cannot have close tags. This is (not coincidentally)
403 # also the list of tags for which the HTML 5 parsing algorithm
404 # requires you to "acknowledge the token's self-closing flag", i.e.
405 # a self-closing tag like <br/> is not an HTML 5 parse error only
408 'br',
'wbr',
'hr',
'meta',
'link'
411 $htmlnest = [ # Tags
that can be nested--??
412 'table',
'tr',
'td',
'th',
'div',
'blockquote',
'ol',
'ul',
413 'li',
'dl',
'dt',
'dd',
'font',
'big',
'small',
'sub',
'sup',
'span',
414 'var',
'kbd',
'samp',
'em',
'strong',
'q',
'ruby',
'bdo'
416 $tabletags = [ # Can only appear inside
table, we
will close
them
422 $listtags = [ # Tags
that can appear
in a
list
427 $htmlsingle[] =
'img';
428 $htmlsingleonly[] =
'img';
431 $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
432 $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
434 # Convert them all to hashtables for faster lookup
435 $vars = [
'htmlpairsStatic',
'htmlsingle',
'htmlsingleonly',
'htmlnest',
'tabletags',
436 'htmllist',
'listtags',
'htmlsingleallowed',
'htmlelementsStatic' ];
438 $$var = array_flip( $$var );
440 $staticInitialised = $globalContext;
# Builds the per-call tag tables from the caller-supplied tag lists.
#  - array_flip() re-keys $extratags and $removetags by their former values
#    (presumably tag names -- TODO confirm against the caller).
#  - $htmlpairs is $extratags merged with $htmlpairsStatic.
#  - $htmlelements is $extratags merged with $htmlelementsStatic, minus every
#    key that is present in $removetags (array_diff_key compares keys only).
# With string keys, array_merge() lets the later (static) entry overwrite the
# value of a duplicate key. NOTE(review): the values look irrelevant, since the
# lookups visible in this file are isset( $table[$tag] ) -- confirm.
443 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
444 $extratags = array_flip( $extratags );
445 $removetags = array_flip( $removetags );
446 $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
447 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
450 'htmlpairs' => $htmlpairs,
451 'htmlsingle' => $htmlsingle,
452 'htmlsingleonly' => $htmlsingleonly,
453 'htmlnest' => $htmlnest,
454 'tabletags' => $tabletags,
455 'htmllist' => $htmllist,
456 'listtags' => $listtags,
457 'htmlsingleallowed' => $htmlsingleallowed,
458 'htmlelements' => $htmlelements,
478 $args = [], $extratags = [], $removetags = [], $warnCallback =
null
# Copies each lookup table out of $tagData into a local variable named after
# its key: htmlpairs, htmlsingle, htmlsingleonly, htmlnest, tabletags,
# htmllist, listtags, htmlsingleallowed, htmlelements.
# NOTE(review): where $tagData comes from is not visible in this excerpt;
# presumably each entry is an array keyed by tag name -- confirm at the producer.
481 $htmlpairs = $tagData[
'htmlpairs'];
482 $htmlsingle = $tagData[
'htmlsingle'];
483 $htmlsingleonly = $tagData[
'htmlsingleonly'];
484 $htmlnest = $tagData[
'htmlnest'];
485 $tabletags = $tagData[
'tabletags'];
486 $htmllist = $tagData[
'htmllist'];
487 $listtags = $tagData[
'listtags'];
488 $htmlsingleallowed = $tagData[
'htmlsingleallowed'];
489 $htmlelements = $tagData[
'htmlelements'];
491 # Remove HTML comments
493 $bits = explode(
'<', $text );
494 $text = str_replace(
'>',
'>', array_shift( $bits ) );
496 $tagstack = $tablestack = [];
497 foreach ( $bits
as $x ) {
499 # $slash: Does the current element start with a '/'?
500 # $t: Current element name
501 # $params: String between element name and >
502 # $brace: Ending '>' or '/>'
503 # $rest: Everything until the next element of $bits
504 if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
507 $slash =
$t =
$params = $brace = $rest =
null;
511 $t = strtolower(
$t );
512 if ( isset( $htmlelements[
$t] ) ) {
514 if ( $slash && isset( $htmlsingleonly[
$t] ) ) {
516 } elseif ( $slash ) {
517 # Closing a tag... is it the one we just opened?
518 Wikimedia\suppressWarnings();
519 $ot = array_pop( $tagstack );
520 Wikimedia\restoreWarnings();
523 if ( isset( $htmlsingleallowed[$ot] ) ) {
524 # Pop all elements with an optional close tag
525 # and see if we find a match below them
527 array_push( $optstack, $ot );
528 Wikimedia\suppressWarnings();
529 $ot = array_pop( $tagstack );
530 Wikimedia\restoreWarnings();
531 while ( $ot !=
$t && isset( $htmlsingleallowed[$ot] ) ) {
532 array_push( $optstack, $ot );
533 Wikimedia\suppressWarnings();
534 $ot = array_pop( $tagstack );
535 Wikimedia\restoreWarnings();
538 # No match. Push the optional elements back again
540 Wikimedia\suppressWarnings();
541 $ot = array_pop( $optstack );
542 Wikimedia\restoreWarnings();
544 array_push( $tagstack, $ot );
545 Wikimedia\suppressWarnings();
546 $ot = array_pop( $optstack );
547 Wikimedia\restoreWarnings();
551 Wikimedia\suppressWarnings();
552 array_push( $tagstack, $ot );
553 Wikimedia\restoreWarnings();
555 # <li> can be nested in <ul> or <ol>, skip those cases:
556 if ( !isset( $htmllist[$ot] ) || !isset( $listtags[
$t] ) ) {
561 if (
$t ==
'table' ) {
562 $tagstack = array_pop( $tablestack );
567 # Keep track for later
568 if ( isset( $tabletags[
$t] ) && !in_array(
'table', $tagstack ) ) {
570 } elseif ( in_array(
$t, $tagstack ) && !isset( $htmlnest[
$t] ) ) {
572 # Is it a self-closed htmlpair? (T7487)
573 } elseif ( $brace ==
'/>' && isset( $htmlpairs[
$t] ) ) {
579 if ( is_callable( $warnCallback ) ) {
580 call_user_func_array( $warnCallback, [
'deprecated-self-close-category' ] );
583 } elseif ( isset( $htmlsingleonly[
$t] ) ) {
584 # Hack to force empty tag for unclosable elements
586 } elseif ( isset( $htmlsingle[
$t] ) ) {
587 # Hack to not close $htmlsingle tags
589 # Still need to push this optionally-closed tag to
590 # the tag stack so that we can match end tags
591 # instead of marking them as bad.
592 array_push( $tagstack,
$t );
593 } elseif ( isset( $tabletags[
$t] ) && in_array(
$t, $tagstack ) ) {
597 if (
$t ==
'table' ) {
598 array_push( $tablestack, $tagstack );
601 array_push( $tagstack,
$t );
604 # Replace any variables or template parameters with
606 if ( is_callable( $processCallback ) ) {
607 call_user_func_array( $processCallback, [ &
$params,
$args ] );
610 if ( !self::validateTag(
$params,
$t ) ) {
614 # Strip non-approved attributes from the tag
618 $rest = str_replace(
'>',
'>', $rest );
619 $close = ( $brace ==
'/>' && !$slash ) ?
' /' :
'';
620 $text .=
"<$slash$t$newparams$close>$rest";
624 $text .=
'<' . str_replace(
'>',
'>', $x );
626 # Close off any remaining tags
627 while ( is_array( $tagstack ) && (
$t = array_pop( $tagstack ) ) ) {
629 if (
$t ==
'table' ) {
630 $tagstack = array_pop( $tablestack );
634 # this might be possible using tidy itself
635 foreach ( $bits
as $x ) {
636 if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
640 $t = strtolower(
$t );
641 if ( isset( $htmlelements[
$t] ) ) {
642 if ( is_callable( $processCallback ) ) {
643 call_user_func_array( $processCallback, [ &
$params,
$args ] );
646 if ( $brace ==
'/>' && !( isset( $htmlsingle[
$t] ) || isset( $htmlsingleonly[
$t] ) ) ) {
652 if ( is_callable( $warnCallback ) ) {
653 call_user_func_array( $warnCallback, [
'deprecated-self-close-category' ] );
656 if ( !self::validateTag(
$params,
$t ) ) {
662 if ( $brace ===
'/>' && !isset( $htmlsingleonly[
$t] ) ) {
663 # Interpret self-closing tags as empty tags even when
664 # HTML 5 would interpret them as start tags. Such input
665 # is commonly seen on Wikimedia wikis with this intention.
669 $rest = str_replace(
'>',
'>', $rest );
670 $text .=
"<$slash$t$newparams$brace$rest";
675 $text .=
'<' . str_replace(
'>',
'>', $x );
691 while ( ( $start = strpos( $text,
'<!--' ) ) !==
false ) {
692 $end = strpos( $text,
'-->', $start + 4 );
693 if ( $end ===
false ) {
694 # Unterminated comment; bail out
700 # Trim space and newline if the comment is both
701 # preceded and followed by a newline
702 $spaceStart = max( $start - 1, 0 );
703 $spaceLen = $end - $spaceStart;
704 while ( substr( $text, $spaceStart, 1 ) ===
' ' && $spaceStart > 0 ) {
708 while ( substr( $text, $spaceStart + $spaceLen, 1 ) ===
' ' ) {
711 if ( substr( $text, $spaceStart, 1 ) ===
"\n"
712 && substr( $text, $spaceStart + $spaceLen, 1 ) ===
"\n" ) {
713 # Remove the comment, leading and trailing
714 # spaces, and leave only one newline.
715 $text = substr_replace( $text,
"\n", $spaceStart, $spaceLen + 1 );
717 # Remove just the comment.
718 $text = substr_replace( $text,
'', $start, $end - $start );
739 if ( $element ==
'meta' || $element ==
'link' ) {
740 if ( !isset(
$params[
'itemprop'] ) ) {
744 if ( $element ==
'meta' && !isset(
$params[
'content'] ) ) {
748 if ( $element ==
'link' && !isset(
$params[
'href'] ) ) {
774 self::attributeWhitelist( $element ) );
793 $whitelist = array_flip( $whitelist );
798 # Allow XML namespace declarations so that RDFa can be used
799 if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
800 if ( !preg_match( self::EVIL_URI_PATTERN,
$value ) ) {
807 # Allow any attribute beginning with "data-"
809 # * Disallow data attributes used by MediaWiki code
810 # * Ensure that the attribute is not namespaced by banning
812 if ( !preg_match(
'/^data-[^:]*$/i', $attribute )
813 && !isset( $whitelist[$attribute] )
814 || self::isReservedDataAttribute( $attribute )
819 # Strip javascript "expression" from stylesheets.
820 # https://msdn.microsoft.com/en-us/library/ms537634.aspx
821 if ( $attribute ==
'style' ) {
825 # Escape HTML id attributes
826 if ( $attribute ===
'id' ) {
830 # Escape HTML id reference lists
831 if ( $attribute ===
'aria-describedby'
832 || $attribute ===
'aria-flowto'
833 || $attribute ===
'aria-labelledby'
834 || $attribute ===
'aria-owns'
841 if ( $attribute ===
'rel' || $attribute ===
'rev'
843 || $attribute ===
'about' || $attribute ===
'property'
844 || $attribute ===
'resource' || $attribute ===
'datatype'
845 || $attribute ===
'typeof'
847 || $attribute ===
'itemid' || $attribute ===
'itemprop'
848 || $attribute ===
'itemref' || $attribute ===
'itemscope'
849 || $attribute ===
'itemtype'
852 if ( preg_match( self::EVIL_URI_PATTERN,
$value ) ) {
857 # NOTE: even though elements using href/src are not allowed directly, supply
858 # validation code that can be used by tag hook handlers, etc
859 if ( $attribute ===
'href' || $attribute ===
'src' || $attribute ===
'poster' ) {
860 if ( !preg_match( $hrefExp,
$value ) ) {
871 # itemtype, itemid, itemref don't make sense without itemscope
872 if ( !array_key_exists(
'itemscope',
$out ) ) {
873 unset(
$out[
'itemtype'] );
874 unset(
$out[
'itemid'] );
875 unset(
$out[
'itemref'] );
877 # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
897 return (
bool)preg_match(
'/^data-(ooui|mw|parsoid)/i', $attr );
911 $out = array_merge( $a, $b );
912 if ( isset( $a[
'class'] ) && isset( $b[
'class'] )
913 && is_string( $a[
'class'] ) && is_string( $b[
'class'] )
914 && $a[
'class'] !== $b[
'class']
916 $classes = preg_split(
'/\s+/',
"{$a['class']} {$b['class']}",
917 -1, PREG_SPLIT_NO_EMPTY );
918 $out[
'class'] = implode(
' ', array_unique( $classes ) );
946 if ( !$decodeRegex ) {
947 $space =
'[\\x20\\t\\r\\n\\f]';
948 $nl =
'(?:\\n|\\r\\n|\\r|\\f)';
950 $decodeRegex =
"/ $backslash
952 ($nl) | # 1. Line continuation
953 ([0-9A-Fa-f]{1,6})$space? | # 2. character number
954 (.) | # 3. backslash cancelling special meaning
955 () | # 4. backslash at end of string
958 $value = preg_replace_callback( $decodeRegex,
959 [ __CLASS__,
'cssDecodeCallback' ],
$value );
962 $value = preg_replace_callback(
966 if ( $cp ===
false ) {
969 return chr( $cp - 65248 );
977 [
'ʀ',
'ɴ',
'ⁿ',
'ʟ',
'ɪ',
'⁽',
'₍' ],
978 [
'r',
'n',
'n',
'l',
'i',
'(',
'(' ],
985 if ( !preg_match(
'! ^ \s* /\* [^*\\/]* \*/ \s* $ !x',
$value ) ) {
996 $commentPos = strpos(
$value,
'/*' );
997 if ( $commentPos !==
false ) {
1006 \xE3\x80\xB1 | # U+3031
1007 \xE3\x82\x9D | # U+309D
1008 \xE3\x83\xBC | # U+30FC
1009 \xE3\x83\xBD | # U+30FD
1010 \xEF\xB9\xBC | # U+FE7C
1011 \xEF\xB9\xBD | # U+FE7D
1012 \xEF\xBD\xB0 # U+FF70
1043 if ( preg_match(
'/[\000-\010\013\016-\037\177]/',
$value ) ||
1045 return '/* invalid control char */';
1046 } elseif ( preg_match(
1051 | -o-link-source\s*:
1056 | attr\s*\([^)]+[\s,]+url
1059 return '/* insecure input */';
1079 if ( $char ==
"\n" || $char ==
'"' || $char ==
"'" || $char ==
'\\' ) {
1082 return '\\' . dechex( ord( $char ) ) .
' ';
1111 if ( trim( $text ) ==
'' ) {
1131 $encValue = htmlspecialchars( $text, ENT_QUOTES );
1136 $encValue = strtr( $encValue, [
1154 # Templates and links may be expanded in later parsing,
1155 # creating invalid or dangerous output. Suppress this.
1156 $encValue = strtr( $encValue, [
1164 "''" =>
'''',
1165 'ISBN' =>
'ISBN',
1167 'PMID' =>
'PMID',
1173 $encValue = preg_replace_callback(
1176 return str_replace(
':',
':',
$matches[1] );
1220 $id = preg_replace(
'/[ \t\n\r\f_\'"&#%]+/',
'_', $id );
1221 $id = trim( $id,
'_' );
1236 $id = urlencode( strtr( $id,
' ',
'_' ) );
1237 $id = strtr( $id, $replace );
1239 if ( !preg_match(
'/^[a-zA-Z]/', $id ) && !in_array(
'noninitial',
$options ) ) {
1265 if ( $mode === self::ID_PRIMARY ) {
1266 throw new UnexpectedValueException(
'$wgFragmentMode is configured with no primary mode' );
1292 throw new UnexpectedValueException(
'$wgFragmentMode is configured with no primary mode' );
1329 $id = mb_substr( $id, 0, 1024 );
1333 $id = str_replace(
' ',
'_', $id );
1342 $id = urlencode( str_replace(
' ',
'_', $id ) );
1343 $id = strtr( $id, $replace );
1345 case 'html5-legacy':
1346 $id = preg_replace(
'/[ \t\n\r\f_\'"&#%]+/',
'_', $id );
1347 $id = trim( $id,
'_' );
1354 throw new InvalidArgumentException(
"Invalid mode '$mode' passed to '" . __METHOD__ );
1376 # Explode the space delimited list string into an array of tokens
1377 $references = preg_split(
'/\s+/',
"{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );
1379 # Escape each token as an id
1380 foreach ( $references
as &$ref ) {
1384 # Merge the array back to a space delimited list string
1385 # If the array is empty, the result will be an empty string ('')
1386 $referenceString = implode(
' ', $references );
1388 return $referenceString;
1404 return rtrim( preg_replace(
1405 [
'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
'/_+/' ],
1419 # It seems wise to escape ' as well as ", as a matter of course. Can't
1420 # hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
1421 # don't cause the entire string to disappear.
1422 $html = htmlspecialchars(
$html, ENT_QUOTES | ENT_SUBSTITUTE );
1435 if ( trim( $text ) ==
'' ) {
1441 if ( !preg_match_all(
1442 self::getAttribsRegex(),
1445 PREG_SET_ORDER ) ) {
1449 foreach ( $pairs
as $set ) {
1450 $attribute = strtolower( $set[1] );
1472 foreach ( $assoc_array
as $attribute =>
$value ) {
1473 $encAttribute = htmlspecialchars( $attribute );
1476 $attribs[] =
"$encAttribute=\"$encValue\"";
1490 if ( isset( $set[5] ) ) {
1493 } elseif ( isset( $set[4] ) ) {
1496 } elseif ( isset( $set[3] ) ) {
1499 } elseif ( !isset( $set[2] ) ) {
1500 # In XHTML, attributes must have a value so return an empty string.
1501 # See "Empty attribute syntax",
1502 # https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
1505 throw new MWException(
"Tag conditions not met. This should never happen and is a bug." );
1514 return preg_replace(
1515 '/\r\n|[\x20\x0d\x0a\x09]/',
1529 return trim( preg_replace(
'/[ _]+/',
' ',
$section ) );
1548 return preg_replace_callback(
1549 self::CHAR_REFS_REGEX,
1550 [
self::class,
'normalizeCharReferencesCallback' ],
1567 if ( is_null(
$ret ) ) {
1568 return htmlspecialchars(
$matches[0] );
1585 if ( isset( self::$htmlEntityAliases[
$name] ) ) {
1586 return '&' . self::$htmlEntityAliases[
$name] .
';';
1587 } elseif ( in_array(
$name, [
'lt',
'gt',
'amp',
'quot' ] ) ) {
1589 } elseif ( isset( self::$htmlEntities[
$name] ) ) {
1590 return '&#' . self::$htmlEntities[
$name] .
';';
1592 return "&$name;";
1601 $point = intval( $codepoint );
1602 if ( self::validateCodepoint( $point ) ) {
1603 return sprintf(
'&#%d;', $point );
1614 $point = hexdec( $codepoint );
1615 if ( self::validateCodepoint( $point ) ) {
1616 return sprintf(
'&#x%x;', $point );
# Codepoint acceptance test. The enclosing routine's header is outside this
# excerpt (presumably the validateCodepoint() helper called elsewhere in the
# file -- TODO confirm).
# The expression is true only when $codepoint is one of:
#   - 0x09 or 0x0a
#   - 0x20 .. 0x7e
#   - 0xa0 .. 0xd7ff
#   - 0xe000 .. 0xfffd
#   - 0x10000 .. 0x10ffff
# Every other value is rejected. That includes 0x0c, 0x0d and 0x7f .. 0x9f
# (see the notes below), plus 0xd800 .. 0xdfff, 0xfffe and 0xffff, which fall
# between the accepted ranges.
1629 # U+000C is valid in HTML5 but not allowed in XML.
1630 # U+000D is valid in XML but not allowed in HTML5.
1631 # U+007F - U+009F are disallowed in HTML5 (control characters).
1632 return $codepoint == 0x09
1633 || $codepoint == 0x0a
1634 || ( $codepoint >= 0x20 && $codepoint <= 0x7e )
1635 || ( $codepoint >= 0xa0 && $codepoint <= 0xd7ff )
1636 || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
1637 || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
1648 return preg_replace_callback(
1649 self::CHAR_REFS_REGEX,
1666 $text = preg_replace_callback(
1667 self::CHAR_REFS_REGEX,
1693 # Last case should be an ampersand by itself
1705 if ( self::validateCodepoint( $codepoint ) ) {
1721 if ( isset( self::$htmlEntityAliases[
$name] ) ) {
1724 if ( isset( self::$htmlEntities[
$name] ) ) {
1739 return isset( $list[$element] )
1752 if ( $whitelist !==
null ) {
1774 # These attributes are specified in section 9 of
1775 # https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
1782 # Microdata. These are specified by
1783 # https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
1791 $block = array_merge( $common, [
'align' ] );
1792 $tablealign = [
'align',
'valign' ];
1800 'nowrap', # deprecated
1801 'width', # deprecated
1802 'height', # deprecated
1803 'bgcolor', # deprecated
1806 # Numbers refer to sections in HTML 4.01 standard describing the element.
1807 # See: https://www.w3.org/TR/html4/
1811 'center' => $common, # deprecated
1830 'strong' => $common,
1841 'blockquote' => array_merge( $common, [
'cite' ] ),
1842 'q' => array_merge( $common, [
'cite' ] ),
1852 'br' => array_merge( $common, [
'clear' ] ),
1854 # https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
1858 'pre' => array_merge( $common, [
'width' ] ),
1861 'ins' => array_merge( $common, [
'cite',
'datetime' ] ),
1862 'del' => array_merge( $common, [
'cite',
'datetime' ] ),
1865 'ul' => array_merge( $common, [
'type' ] ),
1866 'ol' => array_merge( $common, [
'type',
'start',
'reversed' ] ),
1867 'li' => array_merge( $common, [
'type',
'value' ] ),
1875 'table' => array_merge( $common,
1876 [
'summary',
'width',
'border',
'frame',
1877 'rules',
'cellspacing',
'cellpadding',
1882 'caption' => $block,
1890 'colgroup' => array_merge( $common, [
'span' ] ),
1891 'col' => array_merge( $common, [
'span' ] ),
1894 'tr' => array_merge( $common, [
'bgcolor' ], $tablealign ),
1897 'td' => array_merge( $common, $tablecell, $tablealign ),
1898 'th' => array_merge( $common, $tablecell, $tablealign ),
1901 # NOTE: <a> is not allowed directly, but the attrib
1902 # whitelist is used from the Parser object
1903 'a' => array_merge( $common, [
'href',
'rel',
'rev' ] ), # rel/rev esp.
for RDFa
1906 # Not usually allowed, but may be used for extension-style hooks
1907 # such as <math> when it is rasterized, or if $wgAllowImageTag is
1909 'img' => array_merge( $common, [
'alt',
'src',
'width',
'height',
'srcset' ] ),
1911 'video' => array_merge( $common, [
'poster',
'controls',
'preload',
'width',
'height' ] ),
1912 'source' => array_merge( $common, [
'type',
'src' ] ),
1913 'track' => array_merge( $common, [
'type',
'src',
'srclang',
'kind',
'label' ] ),
1921 'strike' => $common,
1926 'font' => array_merge( $common, [
'size',
'color',
'face' ] ),
1930 'hr' => array_merge( $common, [
'width' ] ),
1932 # HTML Ruby annotation text module, simple ruby only.
1933 # https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
1938 'rt' => $common, # array_merge( $common,
array(
'rbspan' ) ),
1941 # MathML root element, where used for extensions
1942 # 'title' may not be 100% valid here; it's XHTML
1943 # https://www.w3.org/TR/REC-MathML/
1944 'math' => [
'class',
'style',
'id',
'title' ],
1947 'figure' => $common,
1948 'figcaption' => $common,
1950 # HTML 5 section 4.6
1953 # HTML5 elements, defined by:
1954 # https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
1955 'data' => array_merge( $common, [
'value' ] ),
1956 'time' => array_merge( $common, [
'datetime' ] ),
1964 'meta' => [
'itemprop',
'content' ],
1965 'link' => [
'itemprop',
'href',
'title' ],
1984 $tokenizer =
new RemexHtml\Tokenizer\Tokenizer(
$handler,
$html, [
1985 'ignoreErrors' =>
true,
1987 'ignoreNulls' =>
true,
1988 'skipPreprocess' =>
true,
1990 $tokenizer->execute();
2007 $out =
"<!DOCTYPE html [\n";
2008 foreach ( self::$htmlEntities
as $entity => $codepoint ) {
2009 $out .=
"<!ENTITY $entity \"&#$codepoint;\">";
2020 # Normalize any HTML entities in input. They will be
2021 # re-escaped by makeExternalLink().
2024 # Escape any control characters introduced by the above step
2025 $url = preg_replace_callback(
'/[\][<>"\\x00-\\x20\\x7F\|]/',
2026 [ __CLASS__,
'cleanUrlCallback' ], $url );
2028 # Validate hostname portion
2030 if ( preg_match(
'!^([^:]+:)(//[^/]+)?(.*)$!iD', $url,
$matches ) ) {
2037 \\s| # general whitespace
2038 \xc2\xad| # 00ad SOFT HYPHEN
2039 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
2040 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
2041 \xe2\x81\xa0| # 2060 WORD JOINER
2042 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
2043 \xcd\x8f| # 034f COMBINING GRAPHEME JOINER
2044 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
2045 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
2046 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
2047 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
2048 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
2049 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
2052 $host = preg_replace( $strip,
'', $host );
2055 if ( substr_compare(
"//%5B", $host, 0, 5 ) === 0 &&
2056 preg_match(
'!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host,
$matches )
2063 return $protocol . $host . $rest;
2114 $rfc5322_atext =
"a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
2115 $rfc1034_ldh_str =
"a-z0-9\\-";
2117 $html5_email_regexp =
"/
2119 [$rfc5322_atext\\.]+ # user part which is liberal :p
2121 [$rfc1034_ldh_str]+ # First domain part
2122 (\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
2126 return (
bool)preg_match( $html5_email_regexp, $addr );