Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
62.82% |
397 / 632 |
|
60.42% |
29 / 48 |
CRAP | |
0.00% |
0 / 1 |
Sanitizer | |
62.92% |
397 / 631 |
|
60.42% |
29 / 48 |
2051.49 | |
0.00% |
0 / 1 |
getAttribsRegex | |
18.18% |
2 / 11 |
|
0.00% |
0 / 1 |
4.19 | |||
getAttribNameRegex | |
40.00% |
2 / 5 |
|
0.00% |
0 / 1 |
2.86 | |||
getRecognizedTagData | |
40.00% |
24 / 60 |
|
0.00% |
0 / 1 |
21.82 | |||
internalRemoveHtmlTags | |
96.43% |
27 / 28 |
|
0.00% |
0 / 1 |
12 | |||
removeSomeTags | |
100.00% |
28 / 28 |
|
100.00% |
1 / 1 |
1 | |||
removeHTMLcomments | |
11.76% |
2 / 17 |
|
0.00% |
0 / 1 |
51.96 | |||
validateTag | |
77.78% |
7 / 9 |
|
0.00% |
0 / 1 |
8.70 | |||
validateTagAttributes | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
validateAttributes | |
91.30% |
42 / 46 |
|
0.00% |
0 / 1 |
36.85 | |||
isReservedDataAttribute | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
mergeAttributes | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
6 | |||
normalizeCss | |
55.56% |
10 / 18 |
|
0.00% |
0 / 1 |
5.40 | |||
checkCss | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
4 | |||
cssDecodeCallback | |
80.00% |
8 / 10 |
|
0.00% |
0 / 1 |
8.51 | |||
fixTagAttributes | |
85.71% |
6 / 7 |
|
0.00% |
0 / 1 |
3.03 | |||
encodeAttribute | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 | |||
armorFrenchSpaces | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
safeEncodeAttribute | |
100.00% |
24 / 24 |
|
100.00% |
1 / 1 |
1 | |||
escapeIdForAttribute | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
escapeIdForLink | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
escapeIdForExternalInterwiki | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
escapeIdInternalUrl | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
escapeIdInternal | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
4 | |||
escapeIdReferenceListInternal | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
escapeClass | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
escapeHtmlAllowEntities | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
decodeTagAttributes | |
100.00% |
19 / 19 |
|
100.00% |
1 / 1 |
5 | |||
safeEncodeTagAttributes | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
getTagAttributeCallback | |
88.89% |
8 / 9 |
|
0.00% |
0 / 1 |
5.03 | |||
normalizeWhitespace | |
60.00% |
3 / 5 |
|
0.00% |
0 / 1 |
2.26 | |||
normalizeSectionNameWhitespace | |
60.00% |
3 / 5 |
|
0.00% |
0 / 1 |
2.26 | |||
normalizeCharReferences | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
normalizeCharReferencesCallback | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
5 | |||
normalizeEntity | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
4 | |||
decCharReference | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
hexCharReference | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
validateCodepoint | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
10 | |||
decodeCharReferences | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
decodeCharReferencesAndNormalize | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
2 | |||
decodeCharReferencesCallback | |
90.00% |
9 / 10 |
|
0.00% |
0 / 1 |
5.03 | |||
decodeChar | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
decodeEntity | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
2.06 | |||
attributesAllowedInternal | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
setupAttributesAllowedInternal | |
2.26% |
3 / 133 |
|
0.00% |
0 / 1 |
5.74 | |||
stripAllTags | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
1 | |||
hackDocType | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
20 | |||
cleanUrl | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
4 | |||
validateEmail | |
91.67% |
11 / 12 |
|
0.00% |
0 / 1 |
2.00 |
1 | <?php |
2 | /** |
3 | * HTML sanitizer for %MediaWiki. |
4 | * |
5 | * Copyright © 2002-2005 Brooke Vibber <bvibber@wikimedia.org> et al |
6 | * https://www.mediawiki.org/ |
7 | * |
8 | * This program is free software; you can redistribute it and/or modify |
9 | * it under the terms of the GNU General Public License as published by |
10 | * the Free Software Foundation; either version 2 of the License, or |
11 | * (at your option) any later version. |
12 | * |
13 | * This program is distributed in the hope that it will be useful, |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | * GNU General Public License for more details. |
17 | * |
18 | * You should have received a copy of the GNU General Public License along |
19 | * with this program; if not, write to the Free Software Foundation, Inc., |
20 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
21 | * http://www.gnu.org/copyleft/gpl.html |
22 | * |
23 | * @file |
24 | * @ingroup Parser |
25 | */ |
26 | |
27 | namespace MediaWiki\Parser; |
28 | |
29 | use InvalidArgumentException; |
30 | use LogicException; |
31 | use MediaWiki\HookContainer\HookRunner; |
32 | use MediaWiki\MediaWikiServices; |
33 | use MediaWiki\Tidy\RemexCompatFormatter; |
34 | use StringUtils; |
35 | use UnexpectedValueException; |
36 | use Wikimedia\RemexHtml\HTMLData; |
37 | use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer; |
38 | use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer; |
39 | use Wikimedia\RemexHtml\TreeBuilder\Dispatcher as RemexDispatcher; |
40 | use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder as RemexTreeBuilder; |
41 | |
42 | /** |
43 | * HTML sanitizer for MediaWiki |
44 | * @ingroup Parser |
45 | */ |
46 | class Sanitizer { |
47 | /** |
48 | * Regular expression to match various types of character references in |
49 | * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences. |
50 | * Note that HTML5 allows some named entities to omit the trailing |
51 | * semicolon; wikitext entities *must* have a trailing semicolon. |
52 | */ |
53 | private const CHAR_REFS_REGEX = |
54 | '/&([A-Za-z0-9\x80-\xff]+;) |
55 | |&\#([0-9]+); |
56 | |&\#[xX]([0-9A-Fa-f]+); |
57 | |&/x'; |
58 | |
59 | /** |
60 | * Acceptable tag name charset from HTML5 parsing spec |
61 | * https://www.w3.org/TR/html5/syntax.html#tag-open-state |
62 | */ |
63 | private const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!'; |
64 | |
65 | /** |
66 | * Pattern matching evil uris like javascript: |
67 | * WARNING: DO NOT use this in any place that actually requires denying |
68 | * certain URIs for security reasons. There are NUMEROUS[1] ways to bypass |
69 | * pattern-based deny lists; the only way to be secure from javascript: |
70 | * uri based xss vectors is to allow only things that you know are safe |
71 | * and deny everything else. |
72 | * [1]: http://ha.ckers.org/xss.html |
73 | */ |
74 | private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i'; |
75 | private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/"; |
76 | |
77 | /** |
78 | * Tells escapeIdForAttribute() to encode the ID using the wiki's primary encoding. |
79 | * |
80 | * @since 1.30 |
81 | */ |
82 | public const ID_PRIMARY = 0; |
83 | |
84 | /** |
85 | * Tells escapeIdForAttribute() to encode the ID using the fallback encoding, or return false |
86 | * if no fallback is configured. |
87 | * |
88 | * @since 1.30 |
89 | */ |
90 | public const ID_FALLBACK = 1; |
91 | |
92 | /** |
93 | * Character entity aliases accepted by MediaWiki in wikitext. |
94 | * These are not part of the HTML standard. |
95 | */ |
96 | private const MW_ENTITY_ALIASES = [ |
97 | 'רלמ;' => 'rlm;', |
98 | 'رلم;' => 'rlm;', |
99 | ]; |
100 | |
101 | /** |
102 | * Lazy-initialised attributes regex, see getAttribsRegex() |
103 | */ |
104 | private static ?string $attribsRegex = null; |
105 | |
106 | /** |
107 | * Regular expression to match HTML/XML attribute pairs within a tag. |
108 | * Based on https://www.w3.org/TR/html5/syntax.html#before-attribute-name-state |
109 | * Used in Sanitizer::decodeTagAttributes |
110 | */ |
111 | private static function getAttribsRegex(): string { /* Returns the attribute-pair regex; it is assembled on the first call and cached in self::$attribsRegex. */ |
112 | if ( self::$attribsRegex === null ) { /* not built yet */ |
113 | $spaceChars = '\x09\x0a\x0c\x0d\x20'; /* tab, LF, FF, CR and space, as PCRE escapes */ |
114 | $space = "[{$spaceChars}]"; |
115 | $attrib = "[^{$spaceChars}\/>=]"; /* name character: anything except whitespace, slash, ">" and "=" */ |
116 | $attribFirst = "(?:{$attrib}|=)"; /* the first name character may additionally be "=" */ |
117 | self::$attribsRegex = /* group 1: name; group 2: optional "=value" part; groups 3, 4, 5: double-quoted, single-quoted, unquoted value */ |
118 | "/({$attribFirst}{$attrib}*) |
119 | ($space*=$space* |
120 | (?: |
121 | # The attribute value: quoted or alone |
122 | \"([^\"]*)(?:\"|\$) |
123 | | '([^']*)(?:'|\$) |
124 | | (((?!$space|>).)*) |
125 | ) |
126 | )?/sxu"; |
127 | } |
128 | return self::$attribsRegex; /* same string on every later call */ |
129 | } |
130 | |
131 | /** |
132 | * Lazy-initialised attribute name regex, see getAttribNameRegex() |
133 | */ |
134 | private static ?string $attribNameRegex = null; |
135 | |
136 | /** |
137 | * Used in Sanitizer::decodeTagAttributes to filter attributes. |
138 | */ |
139 | private static function getAttribNameRegex(): string { /* Returns the anchored attribute-name regex; it is built on the first call and cached in self::$attribNameRegex. */ |
140 | if ( self::$attribNameRegex === null ) { /* not built yet */ |
141 | $attribFirst = "[:_\p{L}\p{N}]"; /* first character: colon, underscore, Unicode letter or Unicode number */ |
142 | $attrib = "[:_\.\-\p{L}\p{N}]"; /* later characters may also be "." or "-" */ |
143 | self::$attribNameRegex = "/^({$attribFirst}{$attrib}*)$/sxu"; /* ^...$ makes the whole name have to match */ |
144 | } |
145 | return self::$attribNameRegex; |
146 | } |
147 | |
148 | /** |
149 | * Return the various lists of recognized tags |
150 | * @param string[] $extratags For any extra tags to include |
151 | * @param string[] $removetags For any tags (default or extra) to exclude |
152 | * @return array |
153 | * @internal |
154 | */ |
155 | public static function getRecognizedTagData( array $extratags = [], array $removetags = [] ): array { /* Returns the tag lists keyed by list name; every list is a tag-name => true map. */ |
156 | static $commonCase, $staticInitialised = false; /* function-level statics: they keep their values between calls */ |
157 | $isCommonCase = ( $extratags === [] && $removetags === [] ); |
158 | if ( $staticInitialised && $isCommonCase && $commonCase ) { /* fast path: reuse the cached result for the no-argument call */ |
159 | return $commonCase; |
160 | } |
161 | |
162 | static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags, |
163 | $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic; |
164 | |
165 | if ( !$staticInitialised ) { /* build the argument-independent lists only once */ |
166 | $htmlpairsStatic = [ # Tags that must be closed |
167 | 'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', |
168 | 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', |
169 | 'strike', 'strong', 'tt', 'var', 'div', 'center', |
170 | 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', |
171 | 'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn', |
172 | 'kbd', 'samp', 'data', 'time', 'mark' |
173 | ]; |
174 | # These tags can be self-closed. For tags not also on |
175 | # $htmlsingleonly, a self-closed tag will be emitted as |
176 | # an empty element (open-tag/close-tag pair). |
177 | $htmlsingle = [ |
178 | 'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link' |
179 | ]; |
180 | |
181 | # Elements that cannot have close tags. This is (not coincidentally) |
182 | # also the list of tags for which the HTML 5 parsing algorithm |
183 | # requires you to "acknowledge the token's self-closing flag", i.e. |
184 | # a self-closing tag like <br/> is not an HTML 5 parse error only |
185 | # for this list. |
186 | $htmlsingleonly = [ |
187 | 'br', 'wbr', 'hr', 'meta', 'link' |
188 | ]; |
189 | |
190 | $htmlnest = [ # Tags that can be nested--?? |
191 | 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', |
192 | 'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span', |
193 | 'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo' |
194 | ]; |
195 | $tabletags = [ # Can only appear inside table, we will close them |
196 | 'td', 'th', 'tr', |
197 | ]; |
198 | $htmllist = [ # Tags used by list |
199 | 'ul', 'ol', |
200 | ]; |
201 | $listtags = [ # Tags that can appear in a list |
202 | 'li', |
203 | ]; |
204 | |
205 | $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) ); |
206 | $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) ); |
207 | |
208 | # Convert them all to hashtables for faster lookup |
209 | $vars = [ 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags', |
210 | 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' ]; |
211 | foreach ( $vars as $var ) { |
212 | $$var = array_fill_keys( $$var, true ); /* variable-variable: replaces the list named by $var with a tag-name => true map */ |
213 | } |
214 | $staticInitialised = true; |
215 | } |
216 | |
217 | # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays |
218 | $extratags = array_fill_keys( $extratags, true ); |
219 | $removetags = array_fill_keys( $removetags, true ); |
220 | $htmlpairs = array_merge( $extratags, $htmlpairsStatic ); /* $removetags is not applied to this list, only to $htmlelements below */ |
221 | $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags ); /* add the extra tags first, then drop the removed ones */ |
222 | |
223 | $result = [ |
224 | 'htmlpairs' => $htmlpairs, |
225 | 'htmlsingle' => $htmlsingle, |
226 | 'htmlsingleonly' => $htmlsingleonly, |
227 | 'htmlnest' => $htmlnest, |
228 | 'tabletags' => $tabletags, |
229 | 'htmllist' => $htmllist, |
230 | 'listtags' => $listtags, |
231 | 'htmlsingleallowed' => $htmlsingleallowed, |
232 | 'htmlelements' => $htmlelements, |
233 | ]; |
234 | if ( $isCommonCase ) { /* remember the default result so later no-argument calls can return early */ |
235 | $commonCase = $result; |
236 | } |
237 | return $result; |
238 | } |
239 | |
240 | /** |
241 | * Cleans up HTML, removes dangerous tags and attributes, and |
242 | * removes HTML comments; BEWARE there may be unmatched HTML |
243 | * tags in the result. |
244 | * |
245 | * @note Callers are recommended to use `::removeSomeTags()` instead |
246 | * of this method. `Sanitizer::removeSomeTags()` is safer and will |
247 | * always return well-formed HTML; however, it is significantly |
248 | * slower (especially for short strings where setup costs |
249 | * predominate). This method is for internal use by the legacy parser |
250 | * where we know the result will be cleaned up in a subsequent tidy pass. |
251 | * |
252 | * @param string $text Original string; see T268353 for why untainted. |
253 | * @param-taint $text none |
254 | * @param callable|null $processCallback Callback to do any variable or |
255 | * parameter replacements in HTML attribute values. |
256 | * This argument should be considered @internal. |
257 | * @param-taint $processCallback exec_shell |
258 | * @param array|bool $args Arguments for the processing callback |
259 | * @param-taint $args none |
260 | * @param array $extratags For any extra tags to include |
261 | * @param-taint $extratags tainted |
262 | * @param array $removetags For any tags (default or extra) to exclude |
263 | * @param-taint $removetags none |
264 | * @return string |
265 | * @return-taint escaped |
266 | * @internal |
267 | */ |
268 | public static function internalRemoveHtmlTags( string $text, ?callable $processCallback = null, |
269 | $args = [], array $extratags = [], array $removetags = [] |
270 | ): string { |
271 | $tagData = self::getRecognizedTagData( $extratags, $removetags ); |
272 | $htmlsingle = $tagData['htmlsingle']; |
273 | $htmlsingleonly = $tagData['htmlsingleonly']; |
274 | $htmlelements = $tagData['htmlelements']; |
275 | |
276 | # Remove HTML comments |
277 | $text = self::removeHTMLcomments( $text ); |
278 | $bits = explode( '<', $text ); |
279 | $text = str_replace( '>', '>', array_shift( $bits ) ); |
280 | |
281 | # this might be possible using remex tidy itself |
282 | foreach ( $bits as $x ) { |
283 | if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) { |
284 | [ /* $qbar */, $slash, $t, $params, $brace, $rest ] = $regs; |
285 | |