Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
62.82% covered (warning)
62.82%
397 / 632
60.42% covered (warning)
60.42%
29 / 48
CRAP
0.00% covered (danger)
0.00%
0 / 1
Sanitizer
62.92% covered (warning)
62.92%
397 / 631
60.42% covered (warning)
60.42%
29 / 48
2051.49
0.00% covered (danger)
0.00%
0 / 1
 getAttribsRegex
18.18% covered (danger)
18.18%
2 / 11
0.00% covered (danger)
0.00%
0 / 1
4.19
 getAttribNameRegex
40.00% covered (danger)
40.00%
2 / 5
0.00% covered (danger)
0.00%
0 / 1
2.86
 getRecognizedTagData
40.00% covered (danger)
40.00%
24 / 60
0.00% covered (danger)
0.00%
0 / 1
21.82
 internalRemoveHtmlTags
96.43% covered (success)
96.43%
27 / 28
0.00% covered (danger)
0.00%
0 / 1
12
 removeSomeTags
100.00% covered (success)
100.00%
28 / 28
100.00% covered (success)
100.00%
1 / 1
1
 removeHTMLcomments
11.76% covered (danger)
11.76%
2 / 17
0.00% covered (danger)
0.00%
0 / 1
51.96
 validateTag
77.78% covered (warning)
77.78%
7 / 9
0.00% covered (danger)
0.00%
0 / 1
8.70
 validateTagAttributes
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 validateAttributes
91.30% covered (success)
91.30%
42 / 46
0.00% covered (danger)
0.00%
0 / 1
36.85
 isReservedDataAttribute
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 mergeAttributes
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
6
 normalizeCss
55.56% covered (warning)
55.56%
10 / 18
0.00% covered (danger)
0.00%
0 / 1
5.40
 checkCss
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
4
 cssDecodeCallback
80.00% covered (warning)
80.00%
8 / 10
0.00% covered (danger)
0.00%
0 / 1
8.51
 fixTagAttributes
85.71% covered (warning)
85.71%
6 / 7
0.00% covered (danger)
0.00%
0 / 1
3.03
 encodeAttribute
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
1
 armorFrenchSpaces
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
1
 safeEncodeAttribute
100.00% covered (success)
100.00%
24 / 24
100.00% covered (success)
100.00%
1 / 1
1
 escapeIdForAttribute
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
3
 escapeIdForLink
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 escapeIdForExternalInterwiki
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 escapeIdInternalUrl
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
2
 escapeIdInternal
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
4
 escapeIdReferenceListInternal
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 escapeClass
0.00% covered (danger)
0.00%
0 / 4
0.00% covered (danger)
0.00%
0 / 1
2
 escapeHtmlAllowEntities
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 decodeTagAttributes
100.00% covered (success)
100.00%
19 / 19
100.00% covered (success)
100.00%
1 / 1
5
 safeEncodeTagAttributes
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
3
 getTagAttributeCallback
88.89% covered (warning)
88.89%
8 / 9
0.00% covered (danger)
0.00%
0 / 1
5.03
 normalizeWhitespace
60.00% covered (warning)
60.00%
3 / 5
0.00% covered (danger)
0.00%
0 / 1
2.26
 normalizeSectionNameWhitespace
60.00% covered (warning)
60.00%
3 / 5
0.00% covered (danger)
0.00%
0 / 1
2.26
 normalizeCharReferences
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
1
 normalizeCharReferencesCallback
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
5
 normalizeEntity
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
4
 decCharReference
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
2
 hexCharReference
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
3
 validateCodepoint
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
10
 decodeCharReferences
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
1
 decodeCharReferencesAndNormalize
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
2
 decodeCharReferencesCallback
90.00% covered (success)
90.00%
9 / 10
0.00% covered (danger)
0.00%
0 / 1
5.03
 decodeChar
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
2
 decodeEntity
75.00% covered (warning)
75.00%
3 / 4
0.00% covered (danger)
0.00%
0 / 1
2.06
 attributesAllowedInternal
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 setupAttributesAllowedInternal
2.26% covered (danger)
2.26%
3 / 133
0.00% covered (danger)
0.00%
0 / 1
5.74
 stripAllTags
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
1
 hackDocType
0.00% covered (danger)
0.00%
0 / 11
0.00% covered (danger)
0.00%
0 / 1
20
 cleanUrl
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
4
 validateEmail
91.67% covered (success)
91.67%
11 / 12
0.00% covered (danger)
0.00%
0 / 1
2.00
1<?php
2/**
3 * HTML sanitizer for %MediaWiki.
4 *
5 * Copyright © 2002-2005 Brooke Vibber <bvibber@wikimedia.org> et al
6 * https://www.mediawiki.org/
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 *
23 * @file
24 * @ingroup Parser
25 */
26
27namespace MediaWiki\Parser;
28
29use InvalidArgumentException;
30use LogicException;
31use MediaWiki\HookContainer\HookRunner;
32use MediaWiki\MediaWikiServices;
33use MediaWiki\Tidy\RemexCompatFormatter;
34use StringUtils;
35use UnexpectedValueException;
36use Wikimedia\RemexHtml\HTMLData;
37use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer;
38use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer;
39use Wikimedia\RemexHtml\TreeBuilder\Dispatcher as RemexDispatcher;
40use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder as RemexTreeBuilder;
41
42/**
43 * HTML sanitizer for MediaWiki
44 * @ingroup Parser
45 */
46class Sanitizer {
47    /**
48     * Regular expression to match various types of character references in
49     * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences.
50     * Note that HTML5 allows some named entities to omit the trailing
51     * semicolon; wikitext entities *must* have a trailing semicolon.
52     */
53    private const CHAR_REFS_REGEX =
54        '/&([A-Za-z0-9\x80-\xff]+;)
55        |&\#([0-9]+);
56        |&\#[xX]([0-9A-Fa-f]+);
57        |&/x';
58
59    /**
60     * Acceptable tag name charset from HTML5 parsing spec
61     * https://www.w3.org/TR/html5/syntax.html#tag-open-state
62     */
63    private const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
64
65    /**
66     * Pattern matching evil uris like javascript:
67     * WARNING: DO NOT use this in any place that actually requires denying
68     * certain URIs for security reasons. There are NUMEROUS[1] ways to bypass
69     * pattern-based deny lists; the only way to be secure from javascript:
70     * uri based xss vectors is to allow only things that you know are safe
71     * and deny everything else.
72     * [1]: http://ha.ckers.org/xss.html
73     */
74    private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
75    private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
76
77    /**
78     * Tells escapeIdForAttribute() to encode the ID using the wiki's primary encoding.
79     *
80     * @since 1.30
81     */
82    public const ID_PRIMARY = 0;
83
84    /**
85     * Tells escapeIdForAttribute() to encode the ID using the fallback encoding, or return false
86     * if no fallback is configured.
87     *
88     * @since 1.30
89     */
90    public const ID_FALLBACK = 1;
91
92    /**
93     * Character entity aliases accepted by MediaWiki in wikitext.
94     * These are not part of the HTML standard.
95     */
96    private const MW_ENTITY_ALIASES = [
97        'רלמ;' => 'rlm;',
98        'رلم;' => 'rlm;',
99    ];
100
101    /**
102     * Lazy-initialised attributes regex, see getAttribsRegex()
103     */
104    private static ?string $attribsRegex = null;
105
106    /**
107     * Regular expression to match HTML/XML attribute pairs within a tag.
108     * Based on https://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
109     * Used in Sanitizer::decodeTagAttributes
110     */
111    private static function getAttribsRegex(): string {
112        if ( self::$attribsRegex === null ) {
113            $spaceChars = '\x09\x0a\x0c\x0d\x20';
114            $space = "[{$spaceChars}]";
115            $attrib = "[^{$spaceChars}\/>=]";
116            $attribFirst = "(?:{$attrib}|=)";
117            self::$attribsRegex =
118                "/({$attribFirst}{$attrib}*)
119                    ($space*=$space*
120                    (?:
121                        # The attribute value: quoted or alone
122                        \"([^\"]*)(?:\"|\$)
123                        | '([^']*)(?:'|\$)
124                        | (((?!$space|>).)*)
125                    )
126                )?/sxu";
127        }
128        return self::$attribsRegex;
129    }
130
131    /**
132     * Lazy-initialised attribute name regex, see getAttribNameRegex()
133     */
134    private static ?string $attribNameRegex = null;
135
136    /**
137     * Used in Sanitizer::decodeTagAttributes to filter attributes.
138     */
139    private static function getAttribNameRegex(): string {
140        if ( self::$attribNameRegex === null ) {
141            $attribFirst = "[:_\p{L}\p{N}]";
142            $attrib = "[:_\.\-\p{L}\p{N}]";
143            self::$attribNameRegex = "/^({$attribFirst}{$attrib}*)$/sxu";
144        }
145        return self::$attribNameRegex;
146    }
147
148    /**
149     * Return the various lists of recognized tags
150     * @param string[] $extratags For any extra tags to include
151     * @param string[] $removetags For any tags (default or extra) to exclude
152     * @return array
153     * @internal
154     */
155    public static function getRecognizedTagData( array $extratags = [], array $removetags = [] ): array {
156        static $commonCase, $staticInitialised = false;
157        $isCommonCase = ( $extratags === [] && $removetags === [] );
158        if ( $staticInitialised && $isCommonCase && $commonCase ) {
159            return $commonCase;
160        }
161
162        static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
163            $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic;
164
165        if ( !$staticInitialised ) {
166            $htmlpairsStatic = [ # Tags that must be closed
167                'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
168                'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
169                'strike', 'strong', 'tt', 'var', 'div', 'center',
170                'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
171                'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
172                'kbd', 'samp', 'data', 'time', 'mark'
173            ];
174            # These tags can be self-closed. For tags not also on
175            # $htmlsingleonly, a self-closed tag will be emitted as
176            # an empty element (open-tag/close-tag pair).
177            $htmlsingle = [
178                'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
179            ];
180
181            # Elements that cannot have close tags. This is (not coincidentally)
182            # also the list of tags for which the HTML 5 parsing algorithm
183            # requires you to "acknowledge the token's self-closing flag", i.e.
184            # a self-closing tag like <br/> is not an HTML 5 parse error only
185            # for this list.
186            $htmlsingleonly = [
187                'br', 'wbr', 'hr', 'meta', 'link'
188            ];
189
190            $htmlnest = [ # Tags that can be nested--??
191                'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
192                'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
193                'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
194            ];
195            $tabletags = [ # Can only appear inside table, we will close them
196                'td', 'th', 'tr',
197            ];
198            $htmllist = [ # Tags used by list
199                'ul', 'ol',
200            ];
201            $listtags = [ # Tags that can appear in a list
202                'li',
203            ];
204
205            $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
206            $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
207
208            # Convert them all to hashtables for faster lookup
209            $vars = [ 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
210                'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' ];
211            foreach ( $vars as $var ) {
212                $$var = array_fill_keys( $$var, true );
213            }
214            $staticInitialised = true;
215        }
216
217        # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
218        $extratags = array_fill_keys( $extratags, true );
219        $removetags = array_fill_keys( $removetags, true );
220        $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
221        $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
222
223        $result = [
224            'htmlpairs' => $htmlpairs,
225            'htmlsingle' => $htmlsingle,
226            'htmlsingleonly' => $htmlsingleonly,
227            'htmlnest' => $htmlnest,
228            'tabletags' => $tabletags,
229            'htmllist' => $htmllist,
230            'listtags' => $listtags,
231            'htmlsingleallowed' => $htmlsingleallowed,
232            'htmlelements' => $htmlelements,
233        ];
234        if ( $isCommonCase ) {
235            $commonCase = $result;
236        }
237        return $result;
238    }
239
240    /**
241     * Cleans up HTML, removes dangerous tags and attributes, and
242     * removes HTML comments; BEWARE there may be unmatched HTML
243     * tags in the result.
244     *
245     * @note Callers are recommended to use `::removeSomeTags()` instead
246     * of this method.  `Sanitizer::removeSomeTags()` is safer and will
247     * always return well-formed HTML; however, it is significantly
248     * slower (especially for short strings where setup costs
249     * predominate).  This method is for internal use by the legacy parser
250     * where we know the result will be cleaned up in a subsequent tidy pass.
251     *
252     * @param string $text Original string; see T268353 for why untainted.
253     * @param-taint $text none
254     * @param callable|null $processCallback Callback to do any variable or
255     *   parameter replacements in HTML attribute values.
256     *   This argument should be considered @internal.
257     * @param-taint $processCallback exec_shell
258     * @param array|bool $args Arguments for the processing callback
259     * @param-taint $args none
260     * @param array $extratags For any extra tags to include
261     * @param-taint $extratags tainted
262     * @param array $removetags For any tags (default or extra) to exclude
263     * @param-taint $removetags none
264     * @return string
265     * @return-taint escaped
266     * @internal
267     */
268    public static function internalRemoveHtmlTags( string $text, ?callable $processCallback = null,
269        $args = [], array $extratags = [], array $removetags = []
270    ): string {
271        $tagData = self::getRecognizedTagData( $extratags, $removetags );
272        $htmlsingle = $tagData['htmlsingle'];
273        $htmlsingleonly = $tagData['htmlsingleonly'];
274        $htmlelements = $tagData['htmlelements'];
275
276        # Remove HTML comments
277        $text = self::removeHTMLcomments( $text );
278        $bits = explode( '<', $text );
279        $text = str_replace( '>', '&gt;', array_shift( $bits ) );
280
281        # this might be possible using remex tidy itself
282        foreach ( $bits as $x ) {
283            if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
284                [ /* $qbar */, $slash, $t, $params, $brace, $rest ] = $regs;
285