Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
63.44% covered (warning)
63.44%
406 / 640
61.22% covered (warning)
61.22%
30 / 49
CRAP
0.00% covered (danger)
0.00%
0 / 1
Sanitizer
63.54% covered (warning)
63.54%
406 / 639
61.22% covered (warning)
61.22%
30 / 49
1979.18
0.00% covered (danger)
0.00%
0 / 1
 getAttribsRegex
18.18% covered (danger)
18.18%
2 / 11
0.00% covered (danger)
0.00%
0 / 1
4.19
 getAttribNameRegex
40.00% covered (danger)
40.00%
2 / 5
0.00% covered (danger)
0.00%
0 / 1
2.86
 getRecognizedTagData
40.00% covered (danger)
40.00%
24 / 60
0.00% covered (danger)
0.00%
0 / 1
21.82
 internalRemoveHtmlTags
96.43% covered (success)
96.43%
27 / 28
0.00% covered (danger)
0.00%
0 / 1
12
 removeSomeTags
100.00% covered (success)
100.00%
29 / 29
100.00% covered (success)
100.00%
1 / 1
1
 removeHTMLcomments
70.59% covered (warning)
70.59%
12 / 17
0.00% covered (danger)
0.00%
0 / 1
9.63
 validateTag
77.78% covered (warning)
77.78%
7 / 9
0.00% covered (danger)
0.00%
0 / 1
8.70
 validateTagAttributes
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 validateAttributes
91.30% covered (success)
91.30%
42 / 46
0.00% covered (danger)
0.00%
0 / 1
36.85
 isReservedDataAttribute
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 mergeAttributes
0.00% covered (danger)
0.00%
0 / 8
0.00% covered (danger)
0.00%
0 / 1
42
 normalizeCss
61.11% covered (warning)
61.11%
11 / 18
0.00% covered (danger)
0.00%
0 / 1
4.94
 checkCss
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
4
 cssDecodeCallback
80.00% covered (warning)
80.00%
8 / 10
0.00% covered (danger)
0.00%
0 / 1
8.51
 fixTagAttributes
85.71% covered (warning)
85.71%
6 / 7
0.00% covered (danger)
0.00%
0 / 1
3.03
 encodeAttribute
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
1
 armorFrenchSpaces
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
1
 safeEncodeAttribute
100.00% covered (success)
100.00%
24 / 24
100.00% covered (success)
100.00%
1 / 1
1
 escapeIdForAttribute
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
3
 escapeIdForLink
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 escapeIdForExternalInterwiki
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 escapeIdInternalUrl
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
2
 escapeIdInternal
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
4
 escapeIdReferenceListInternal
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 escapeClass
0.00% covered (danger)
0.00%
0 / 4
0.00% covered (danger)
0.00%
0 / 1
2
 escapeCombiningChar
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 escapeHtmlAllowEntities
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 decodeTagAttributes
100.00% covered (success)
100.00%
19 / 19
100.00% covered (success)
100.00%
1 / 1
5
 safeEncodeTagAttributes
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
3
 getTagAttributeCallback
88.89% covered (warning)
88.89%
8 / 9
0.00% covered (danger)
0.00%
0 / 1
5.03
 normalizeWhitespace
60.00% covered (warning)
60.00%
3 / 5
0.00% covered (danger)
0.00%
0 / 1
2.26
 normalizeSectionNameWhitespace
60.00% covered (warning)
60.00%
3 / 5
0.00% covered (danger)
0.00%
0 / 1
2.26
 normalizeCharReferences
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
1
 normalizeCharReferencesCallback
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
5
 normalizeEntity
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
4
 decCharReference
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
2
 hexCharReference
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
3
 validateCodepoint
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
10
 decodeCharReferences
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
1
 decodeCharReferencesAndNormalize
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
2
 decodeCharReferencesCallback
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
5
 decodeChar
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
2
 decodeEntity
75.00% covered (warning)
75.00%
3 / 4
0.00% covered (danger)
0.00%
0 / 1
2.06
 attributesAllowedInternal
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 setupAttributesAllowedInternal
2.21% covered (danger)
2.21%
3 / 136
0.00% covered (danger)
0.00%
0 / 1
5.74
 stripAllTags
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
1
 hackDocType
0.00% covered (danger)
0.00%
0 / 11
0.00% covered (danger)
0.00%
0 / 1
20
 cleanUrl
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
4
 validateEmail
91.67% covered (success)
91.67%
11 / 12
0.00% covered (danger)
0.00%
0 / 1
2.00
1<?php
2/**
3 * HTML sanitizer for %MediaWiki.
4 *
5 * Copyright © 2002-2005 Brooke Vibber <bvibber@wikimedia.org> et al
6 * https://www.mediawiki.org/
7 *
8 * @license GPL-2.0-or-later
9 * @file
10 * @ingroup Parser
11 */
12
13namespace MediaWiki\Parser;
14
15use InvalidArgumentException;
16use LogicException;
17use MediaWiki\HookContainer\HookRunner;
18use MediaWiki\MediaWikiServices;
19use MediaWiki\Tidy\RemexCompatFormatter;
20use UnexpectedValueException;
21use Wikimedia\RemexHtml\HTMLData;
22use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer;
23use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer;
24use Wikimedia\RemexHtml\TreeBuilder\Dispatcher as RemexDispatcher;
25use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder as RemexTreeBuilder;
26use Wikimedia\StringUtils\StringUtils;
27
28/**
29 * HTML sanitizer for MediaWiki
30 * @ingroup Parser
31 */
32class Sanitizer {
33    /**
34     * Regular expression to match various types of character references in
35     * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences.
36     * Note that HTML5 allows some named entities to omit the trailing
37     * semicolon; wikitext entities *must* have a trailing semicolon.
38     */
39    private const CHAR_REFS_REGEX =
40        '/&([A-Za-z0-9\x80-\xff]+;)
41        |&\#([0-9]+);
42        |&\#[xX]([0-9A-Fa-f]+);
43        |&/x';
44
45    /**
46     * Acceptable tag name charset from HTML5 parsing spec
47     * https://www.w3.org/TR/html5/syntax.html#tag-open-state
48     */
49    private const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
50
51    /**
52     * Pattern matching evil uris like javascript:
53     * WARNING: DO NOT use this in any place that actually requires denying
54     * certain URIs for security reasons. There are NUMEROUS[1] ways to bypass
55     * pattern-based deny lists; the only way to be secure from javascript:
56     * uri based xss vectors is to allow only things that you know are safe
57     * and deny everything else.
58     * [1]: http://ha.ckers.org/xss.html
59     */
60    private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
61    private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
62
63    /**
64     * Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding.
65     *
66     * @since 1.30
67     */
68    public const ID_PRIMARY = 0;
69
70    /**
71     * Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false
72     * if no fallback is configured.
73     *
74     * @since 1.30
75     */
76    public const ID_FALLBACK = 1;
77
78    /**
79     * Character entity aliases accepted by MediaWiki in wikitext.
80     * These are not part of the HTML standard.
81     */
82    private const MW_ENTITY_ALIASES = [
83        'רלמ;' => 'rlm;',
84        'رلم;' => 'rlm;',
85    ];
86
87    /**
88     * Lazy-initialised attributes regex, see getAttribsRegex()
89     */
90    private static ?string $attribsRegex = null;
91
92    /**
93     * Regular expression to match HTML/XML attribute pairs within a tag.
94     * Based on https://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
95     * Used in Sanitizer::decodeTagAttributes
96     */
97    private static function getAttribsRegex(): string {
98        if ( self::$attribsRegex === null ) {
99            $spaceChars = '\x09\x0a\x0c\x0d\x20';
100            $space = "[{$spaceChars}]";
101            $attrib = "[^{$spaceChars}\/>=]";
102            $attribFirst = "(?:{$attrib}|=)";
103            self::$attribsRegex =
104                "/({$attribFirst}{$attrib}*)
105                    ($space*=$space*
106                    (?:
107                        # The attribute value: quoted or alone
108                        \"([^\"]*)(?:\"|\$)
109                        | '([^']*)(?:'|\$)
110                        | (((?!$space|>).)*)
111                    )
112                )?/sxu";
113        }
114        return self::$attribsRegex;
115    }
116
117    /**
118     * Lazy-initialised attribute name regex, see getAttribNameRegex()
119     */
120    private static ?string $attribNameRegex = null;
121
122    /**
123     * Used in Sanitizer::decodeTagAttributes to filter attributes.
124     */
125    private static function getAttribNameRegex(): string {
126        if ( self::$attribNameRegex === null ) {
127            $attribFirst = "[:_\p{L}\p{N}]";
128            $attrib = "[:_\.\-\p{L}\p{N}]";
129            self::$attribNameRegex = "/^({$attribFirst}{$attrib}*)$/sxu";
130        }
131        return self::$attribNameRegex;
132    }
133
134    /**
135     * Return the various lists of recognized tags
136     * @param string[] $extratags For any extra tags to include
137     * @param string[] $removetags For any tags (default or extra) to exclude
138     * @return array
139     * @internal
140     */
141    public static function getRecognizedTagData( array $extratags = [], array $removetags = [] ): array {
142        static $commonCase, $staticInitialised = false;
143        $isCommonCase = ( $extratags === [] && $removetags === [] );
144        if ( $staticInitialised && $isCommonCase && $commonCase ) {
145            return $commonCase;
146        }
147
148        static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
149            $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic;
150
151        if ( !$staticInitialised ) {
152            $htmlpairsStatic = [ # Tags that must be closed
153                'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
154                'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
155                'strike', 'strong', 'tt', 'var', 'div', 'center',
156                'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
157                'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
158                'kbd', 'samp', 'data', 'time', 'mark'
159            ];
160            # These tags can be self-closed. For tags not also on
161            # $htmlsingleonly, a self-closed tag will be emitted as
162            # an empty element (open-tag/close-tag pair).
163            $htmlsingle = [
164                'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
165            ];
166
167            # Elements that cannot have close tags. This is (not coincidentally)
168            # also the list of tags for which the HTML 5 parsing algorithm
169            # requires you to "acknowledge the token's self-closing flag", i.e.
170            # a self-closing tag like <br/> is not an HTML 5 parse error only
171            # for this list.
172            $htmlsingleonly = [
173                'br', 'wbr', 'hr', 'meta', 'link'
174            ];
175
176            $htmlnest = [ # Tags that can be nested--??
177                'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
178                'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
179                'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
180            ];
181            $tabletags = [ # Can only appear inside table, we will close them
182                'td', 'th', 'tr',
183            ];
184            $htmllist = [ # Tags used by list
185                'ul', 'ol',
186            ];
187            $listtags = [ # Tags that can appear in a list
188                'li',
189            ];
190
191            $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
192            $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
193
194            # Convert them all to hashtables for faster lookup
195            $vars = [ 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
196                'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' ];
197            foreach ( $vars as $var ) {
198                $$var = array_fill_keys( $$var, true );
199            }
200            $staticInitialised = true;
201        }
202
203        # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
204        $extratags = array_fill_keys( $extratags, true );
205        $removetags = array_fill_keys( $removetags, true );
206        $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
207        $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
208
209        $result = [
210            'htmlpairs' => $htmlpairs,
211            'htmlsingle' => $htmlsingle,
212            'htmlsingleonly' => $htmlsingleonly,
213            'htmlnest' => $htmlnest,
214            'tabletags' => $tabletags,
215            'htmllist' => $htmllist,
216            'listtags' => $listtags,
217            'htmlsingleallowed' => $htmlsingleallowed,
218            'htmlelements' => $htmlelements,
219        ];
220        if ( $isCommonCase ) {
221            $commonCase = $result;
222        }
223        return $result;
224    }
225
226    /**
227     * Cleans up HTML, removes dangerous tags and attributes, and
228     * removes HTML comments; BEWARE there may be unmatched HTML
229     * tags in the result.
230     *
231     * @note Callers are recommended to use `::removeSomeTags()` instead
232     * of this method.  `Sanitizer::removeSomeTags()` is safer and will
233     * always return well-formed HTML; however, it is significantly
234     * slower (especially for short strings where setup costs
235     * predominate).  This method is for internal use by the legacy parser
236     * where we know the result will be cleaned up in a subsequent tidy pass.
237     *
238     * @param string $text Original string; see T268353 for why untainted.
239     * @param-taint $text none
240     * @param callable|null $processCallback Callback to do any variable or
241     *   parameter replacements in HTML attribute values.
242     *   This argument should be considered @internal.
243     * @param-taint $processCallback exec_shell
244     * @param array|bool $args Arguments for the processing callback
245     * @param-taint $args none
246     * @param array $extratags For any extra tags to include
247     * @param-taint $extratags tainted
248     * @param array $removetags For any tags (default or extra) to exclude
249     * @param-taint $removetags none
250     * @return string
251     * @return-taint escaped
252     * @internal
253     */
254    public static function internalRemoveHtmlTags( string $text, ?callable $processCallback = null,
255        $args = [], array $extratags = [], array $removetags = []
256    ): string {
257        $tagData = self::getRecognizedTagData( $extratags, $removetags );
258        $htmlsingle = $tagData['htmlsingle'];
259        $htmlsingleonly = $tagData['htmlsingleonly'];
260        $htmlelements = $tagData['htmlelements'];
261
262        # Remove HTML comments
263        $text = self::removeHTMLcomments( $text );
264        $bits = explode( '<', $text );
265        $text = str_replace( '>', '&gt;', array_shift( $bits ) );
266
267        # this might be possible using remex tidy itself
268        foreach ( $bits as $x ) {
269            if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
270                [ /* $qbar */, $slash, $t, $params, $brace, $rest ] = $regs;
271
272                $badtag = false;
273                $t = strtolower( $t );
274                if ( isset( $htmlelements[$t] ) ) {
275                    if ( is_callable( $processCallback ) ) {
276                        $processCallback( $params, $args );
277                    }
278