Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
63.44% |
406 / 640 |
|
61.22% |
30 / 49 |
CRAP | |
0.00% |
0 / 1 |
| Sanitizer | |
63.54% |
406 / 639 |
|
61.22% |
30 / 49 |
1979.18 | |
0.00% |
0 / 1 |
| getAttribsRegex | |
18.18% |
2 / 11 |
|
0.00% |
0 / 1 |
4.19 | |||
| getAttribNameRegex | |
40.00% |
2 / 5 |
|
0.00% |
0 / 1 |
2.86 | |||
| getRecognizedTagData | |
40.00% |
24 / 60 |
|
0.00% |
0 / 1 |
21.82 | |||
| internalRemoveHtmlTags | |
96.43% |
27 / 28 |
|
0.00% |
0 / 1 |
12 | |||
| removeSomeTags | |
100.00% |
29 / 29 |
|
100.00% |
1 / 1 |
1 | |||
| removeHTMLcomments | |
70.59% |
12 / 17 |
|
0.00% |
0 / 1 |
9.63 | |||
| validateTag | |
77.78% |
7 / 9 |
|
0.00% |
0 / 1 |
8.70 | |||
| validateTagAttributes | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| validateAttributes | |
91.30% |
42 / 46 |
|
0.00% |
0 / 1 |
36.85 | |||
| isReservedDataAttribute | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| mergeAttributes | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
42 | |||
| normalizeCss | |
61.11% |
11 / 18 |
|
0.00% |
0 / 1 |
4.94 | |||
| checkCss | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
4 | |||
| cssDecodeCallback | |
80.00% |
8 / 10 |
|
0.00% |
0 / 1 |
8.51 | |||
| fixTagAttributes | |
85.71% |
6 / 7 |
|
0.00% |
0 / 1 |
3.03 | |||
| encodeAttribute | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 | |||
| armorFrenchSpaces | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
| safeEncodeAttribute | |
100.00% |
24 / 24 |
|
100.00% |
1 / 1 |
1 | |||
| escapeIdForAttribute | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
| escapeIdForLink | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
| escapeIdForExternalInterwiki | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| escapeIdInternalUrl | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
| escapeIdInternal | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
4 | |||
| escapeIdReferenceListInternal | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
| escapeClass | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
| escapeCombiningChar | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| escapeHtmlAllowEntities | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| decodeTagAttributes | |
100.00% |
19 / 19 |
|
100.00% |
1 / 1 |
5 | |||
| safeEncodeTagAttributes | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
| getTagAttributeCallback | |
88.89% |
8 / 9 |
|
0.00% |
0 / 1 |
5.03 | |||
| normalizeWhitespace | |
60.00% |
3 / 5 |
|
0.00% |
0 / 1 |
2.26 | |||
| normalizeSectionNameWhitespace | |
60.00% |
3 / 5 |
|
0.00% |
0 / 1 |
2.26 | |||
| normalizeCharReferences | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
| normalizeCharReferencesCallback | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
5 | |||
| normalizeEntity | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
4 | |||
| decCharReference | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
| hexCharReference | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
| validateCodepoint | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
10 | |||
| decodeCharReferences | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
| decodeCharReferencesAndNormalize | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
2 | |||
| decodeCharReferencesCallback | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
5 | |||
| decodeChar | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
| decodeEntity | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
2.06 | |||
| attributesAllowedInternal | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| setupAttributesAllowedInternal | |
2.21% |
3 / 136 |
|
0.00% |
0 / 1 |
5.74 | |||
| stripAllTags | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
1 | |||
| hackDocType | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
20 | |||
| cleanUrl | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
4 | |||
| validateEmail | |
91.67% |
11 / 12 |
|
0.00% |
0 / 1 |
2.00 | |||
| 1 | <?php |
| 2 | /** |
| 3 | * HTML sanitizer for %MediaWiki. |
| 4 | * |
| 5 | * Copyright © 2002-2005 Brooke Vibber <bvibber@wikimedia.org> et al |
| 6 | * https://www.mediawiki.org/ |
| 7 | * |
| 8 | * @license GPL-2.0-or-later |
| 9 | * @file |
| 10 | * @ingroup Parser |
| 11 | */ |
| 12 | |
| 13 | namespace MediaWiki\Parser; |
| 14 | |
| 15 | use InvalidArgumentException; |
| 16 | use LogicException; |
| 17 | use MediaWiki\HookContainer\HookRunner; |
| 18 | use MediaWiki\MediaWikiServices; |
| 19 | use MediaWiki\Tidy\RemexCompatFormatter; |
| 20 | use UnexpectedValueException; |
| 21 | use Wikimedia\RemexHtml\HTMLData; |
| 22 | use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer; |
| 23 | use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer; |
| 24 | use Wikimedia\RemexHtml\TreeBuilder\Dispatcher as RemexDispatcher; |
| 25 | use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder as RemexTreeBuilder; |
| 26 | use Wikimedia\StringUtils\StringUtils; |
| 27 | |
| 28 | /** |
| 29 | * HTML sanitizer for MediaWiki |
| 30 | * @ingroup Parser |
| 31 | */ |
| 32 | class Sanitizer { |
| 33 | /** |
| 34 | * Regular expression to match various types of character references in |
| 35 | * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences. |
| 36 | * Note that HTML5 allows some named entities to omit the trailing |
| 37 | * semicolon; wikitext entities *must* have a trailing semicolon. |
| 38 | */ |
| 39 | private const CHAR_REFS_REGEX = |
| 40 | '/&([A-Za-z0-9\x80-\xff]+;) |
| 41 | |&\#([0-9]+); |
| 42 | |&\#[xX]([0-9A-Fa-f]+); |
| 43 | |&/x'; |
| 44 | |
| 45 | /** |
| 46 | * Acceptable tag name charset from HTML5 parsing spec |
| 47 | * https://www.w3.org/TR/html5/syntax.html#tag-open-state |
| 48 | */ |
| 49 | private const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!'; |
| 50 | |
| 51 | /** |
| 52 | * Pattern matching evil uris like javascript: |
| 53 | * WARNING: DO NOT use this in any place that actually requires denying |
| 54 | * certain URIs for security reasons. There are NUMEROUS[1] ways to bypass |
| 55 | * pattern-based deny lists; the only way to be secure from javascript: |
| 56 | * uri based xss vectors is to allow only things that you know are safe |
| 57 | * and deny everything else. |
| 58 | * [1]: http://ha.ckers.org/xss.html |
| 59 | */ |
| 60 | private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i'; |
| 61 | private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/"; // "xmlns:" followed by one or more of : A-Z _ a-z - . 0-9
| 62 | |
| 63 | /** |
| 64 | * Tells escapeIdForAttribute() to encode the ID using the wiki's primary encoding.
| 65 | * |
| 66 | * @since 1.30 |
| 67 | */ |
| 68 | public const ID_PRIMARY = 0; |
| 69 | |
| 70 | /** |
| 71 | * Tells escapeIdForAttribute() to encode the ID using the fallback encoding, or return false
| 72 | * if no fallback is configured.
| 73 | * |
| 74 | * @since 1.30 |
| 75 | */ |
| 76 | public const ID_FALLBACK = 1; |
| 77 | |
| 78 | /** |
| 79 | * Character entity aliases accepted by MediaWiki in wikitext. |
| 80 | * These are not part of the HTML standard. |
| 81 | */ |
| 82 | private const MW_ENTITY_ALIASES = [ |
| 83 | 'רלמ;' => 'rlm;', |
| 84 | 'رلم;' => 'rlm;', |
| 85 | ]; |
| 86 | |
| 87 | /** |
| 88 | * Lazy-initialised attributes regex, see getAttribsRegex() |
| 89 | */ |
| 90 | private static ?string $attribsRegex = null; |
| 91 | |
| 92 | /** |
| 93 | * Regular expression to match HTML/XML attribute pairs within a tag. |
| 94 | * Based on https://www.w3.org/TR/html5/syntax.html#before-attribute-name-state |
| 95 | * Used in Sanitizer::decodeTagAttributes |
| 96 | */ |
| 97 | private static function getAttribsRegex(): string {
| 98 | if ( !isset( self::$attribsRegex ) ) {
| 99 | $ws = '\x09\x0a\x0c\x0d\x20'; // TAB, LF, FF, CR and SPACE, written as PCRE hex escapes
| 100 | $wsClass = "[{$ws}]";
| 101 | $nameChar = "[^{$ws}\/>=]"; // anything except whitespace, "/", ">" and "="
| 102 | $nameStart = "(?:{$nameChar}|=)"; // the first character may additionally be "="
| 103 | self::$attribsRegex =
| 104 | "/({$nameStart}{$nameChar}*)
| 105 | ({$wsClass}*={$wsClass}*
| 106 | (?:
| 107 | # The attribute value: quoted or alone
| 108 | \"([^\"]*)(?:\"|\$)
| 109 | | '([^']*)(?:'|\$)
| 110 | | (((?!{$wsClass}|>).)*)
| 111 | )
| 112 | )?/sxu";
| 113 | }
| 114 | return self::$attribsRegex;
| 115 | }
| 116 | |
| 117 | /** |
| 118 | * Lazy-initialised attribute name regex, see getAttribNameRegex() |
| 119 | */ |
| 120 | private static ?string $attribNameRegex = null; |
| 121 | |
| 122 | /** |
| 123 | * Used in Sanitizer::decodeTagAttributes to filter attributes. |
| 124 | */ |
| 125 | private static function getAttribNameRegex(): string {
| 126 | if ( self::$attribNameRegex !== null ) {
| 127 | return self::$attribNameRegex; // already built by an earlier call
| 128 | }
| 129 | $firstChar = "[:_\p{L}\p{N}]"; // ":", "_", or any Unicode letter or number
| 130 | $laterChar = "[:_\.\-\p{L}\p{N}]"; // the same set plus "." and "-"
| 131 | return self::$attribNameRegex = "/^({$firstChar}{$laterChar}*)$/sxu";
| 132 | }
| 133 | |
| 134 | /** |
| 135 | * Return the various lists of recognized tags |
| 136 | * @param string[] $extratags For any extra tags to include |
| 137 | * @param string[] $removetags For any tags (default or extra) to exclude |
| 138 | * @return array |
| 139 | * @internal |
| 140 | */ |
| 141 | public static function getRecognizedTagData( array $extratags = [], array $removetags = [] ): array {
| 142 | # Returns an array with the keys 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest',
| 143 | # 'tabletags', 'htmllist', 'listtags', 'htmlsingleallowed' and 'htmlelements'; every value
| 144 | # is a map of tag name => true. $extratags are added to 'htmlpairs' and 'htmlelements',
| 145 | # while $removetags are only taken out of 'htmlelements'.
| 146 | #
| 147 | # $lookup caches the built-in tables, $defaultResult the result for empty $extratags/$removetags.
| 148 | static $lookup = null, $defaultResult = null;
| 149 | $noCustomisation = ( $extratags === [] && $removetags === [] );
| 150 | if ( $noCustomisation && $defaultResult ) {
| 151 | return $defaultResult;
| 152 | }
| 153 |
| 154 | if ( $lookup === null ) {
| 155 | $lookup = [
| 156 | # Tags that must be closed
| 157 | 'htmlpairs' => [
| 158 | 'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
| 159 | 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
| 160 | 'strike', 'strong', 'tt', 'var', 'div', 'center',
| 161 | 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
| 162 | 'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
| 163 | 'kbd', 'samp', 'data', 'time', 'mark'
| 164 | ],
| 165 | # These tags can be self-closed. For tags not also in
| 166 | # 'htmlsingleonly', a self-closed tag will be emitted as
| 167 | # an empty element (open-tag/close-tag pair).
| 168 | 'htmlsingle' => [
| 169 | 'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
| 170 | ],
| 171 | # Elements that cannot have close tags. This is (not coincidentally)
| 172 | # also the list of tags for which the HTML 5 parsing algorithm
| 173 | # requires you to "acknowledge the token's self-closing flag", i.e.
| 174 | # a self-closing tag like <br/> is not an HTML 5 parse error only
| 175 | # for this list.
| 176 | 'htmlsingleonly' => [
| 177 | 'br', 'wbr', 'hr', 'meta', 'link'
| 178 | ],
| 179 | # Tags that can be nested
| 180 | 'htmlnest' => [
| 181 | 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
| 182 | 'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
| 183 | 'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
| 184 | ],
| 185 | # Can only appear inside table, we will close them
| 186 | 'tabletags' => [
| 187 | 'td', 'th', 'tr',
| 188 | ],
| 189 | # Tags used by list
| 190 | 'htmllist' => [ 'ul', 'ol' ],
| 191 | # Tags that can appear in a list
| 192 | 'listtags' => [ 'li' ],
| 193 | ];
| 194 | # Derived tables; appended last so the key order of the result stays the same
| 195 | $lookup['htmlsingleallowed'] = array_unique(
| 196 | array_merge( $lookup['htmlsingle'], $lookup['tabletags'] )
| 197 | );
| 198 | $lookup['htmlelements'] = array_unique(
| 199 | array_merge( $lookup['htmlsingle'], $lookup['htmlpairs'], $lookup['htmlnest'] )
| 200 | );
| 201 |
| 202 | # Turn every list into a tag => true map for faster lookup
| 203 | foreach ( $lookup as $tableName => $tagNames ) {
| 204 | $lookup[$tableName] = array_fill_keys( $tagNames, true );
| 205 | }
| 206 | }
| 207 |
| 208 | # Fold the caller's $extratags and $removetags into the built-in tables
| 209 | $extraSet = array_fill_keys( $extratags, true );
| 210 | $removedSet = array_fill_keys( $removetags, true );
| 211 |
| 212 | $result = $lookup;
| 213 | $result['htmlpairs'] = array_merge( $extraSet, $lookup['htmlpairs'] );
| 214 | $result['htmlelements'] = array_diff_key(
| 215 | array_merge( $extraSet, $lookup['htmlelements'] ),
| 216 | $removedSet
| 217 | );
| 218 |
| 219 | if ( $noCustomisation ) {
| 220 | # Remember the unmodified answer so later default calls can return early
| 221 | $defaultResult = $result;
| 222 | }
| 223 | return $result;
| 224 | }
| 225 | |
| 226 | /** |
| 227 | * Cleans up HTML, removes dangerous tags and attributes, and |
| 228 | * removes HTML comments; BEWARE there may be unmatched HTML |
| 229 | * tags in the result. |
| 230 | * |
| 231 | * @note Callers are recommended to use `::removeSomeTags()` instead |
| 232 | * of this method. `Sanitizer::removeSomeTags()` is safer and will |
| 233 | * always return well-formed HTML; however, it is significantly |
| 234 | * slower (especially for short strings where setup costs |
| 235 | * predominate). This method is for internal use by the legacy parser |
| 236 | * where we know the result will be cleaned up in a subsequent tidy pass. |
| 237 | * |
| 238 | * @param string $text Original string; see T268353 for why untainted. |
| 239 | * @param-taint $text none |
| 240 | * @param callable|null $processCallback Callback to do any variable or |
| 241 | * parameter replacements in HTML attribute values. |
| 242 | * This argument should be considered @internal. |
| 243 | * @param-taint $processCallback exec_shell |
| 244 | * @param array|bool $args Arguments for the processing callback |
| 245 | * @param-taint $args none |
| 246 | * @param array $extratags For any extra tags to include |
| 247 | * @param-taint $extratags tainted |
| 248 | * @param array $removetags For any tags (default or extra) to exclude |
| 249 | * @param-taint $removetags none |
| 250 | * @return string |
| 251 | * @return-taint escaped |
| 252 | * @internal |
| 253 | */ |
| 254 | public static function internalRemoveHtmlTags( string $text, ?callable $processCallback = null, |
| 255 | $args = [], array $extratags = [], array $removetags = [] |
| 256 | ): string { |
| 257 | $tagData = self::getRecognizedTagData( $extratags, $removetags ); |
| 258 | $htmlsingle = $tagData['htmlsingle']; |
| 259 | $htmlsingleonly = $tagData['htmlsingleonly']; |
| 260 | $htmlelements = $tagData['htmlelements']; |
| 261 | |
| 262 | # Remove HTML comments |
| 263 | $text = self::removeHTMLcomments( $text ); |
| 264 | $bits = explode( '<', $text ); |
| 265 | $text = str_replace( '>', '>', array_shift( $bits ) ); |
| 266 | |
| 267 | # this might be possible using remex tidy itself |
| 268 | foreach ( $bits as $x ) { |
| 269 | if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) { |
| 270 | [ /* $qbar */, $slash, $t, $params, $brace, $rest ] = $regs; |
| 271 | |
| 272 | $badtag = false; |
| 273 | $t = strtolower( $t ); |
| 274 | if ( isset( $htmlelements[$t] ) ) { |
| 275 | if ( is_callable( $processCallback ) ) { |
| 276 | $processCallback( $params, $args ); |
| 277 | } |
| 278 | |