Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
63.04% covered (warning)
63.04%
394 / 625
64.58% covered (warning)
64.58%
31 / 48
CRAP
0.00% covered (danger)
0.00%
0 / 1
Sanitizer
63.14% covered (warning)
63.14%
394 / 624
64.58% covered (warning)
64.58%
31 / 48
1938.11
0.00% covered (danger)
0.00%
0 / 1
 getAttribsRegex
18.18% covered (danger)
18.18%
2 / 11
0.00% covered (danger)
0.00%
0 / 1
4.19
 getAttribNameRegex
40.00% covered (danger)
40.00%
2 / 5
0.00% covered (danger)
0.00%
0 / 1
2.86
 getRecognizedTagData
40.00% covered (danger)
40.00%
24 / 60
0.00% covered (danger)
0.00%
0 / 1
21.82
 internalRemoveHtmlTags
96.43% covered (success)
96.43%
27 / 28
0.00% covered (danger)
0.00%
0 / 1
12
 removeSomeTags
100.00% covered (success)
100.00%
28 / 28
100.00% covered (success)
100.00%
1 / 1
1
 removeHTMLcomments
11.76% covered (danger)
11.76%
2 / 17
0.00% covered (danger)
0.00%
0 / 1
51.96
 validateTag
77.78% covered (warning)
77.78%
7 / 9
0.00% covered (danger)
0.00%
0 / 1
8.70
 validateTagAttributes
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 validateAttributes
90.70% covered (success)
90.70%
39 / 43
0.00% covered (danger)
0.00%
0 / 1
34.93
 isReservedDataAttribute
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 mergeAttributes
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
6
 normalizeCss
55.56% covered (warning)
55.56%
10 / 18
0.00% covered (danger)
0.00%
0 / 1
5.40
 checkCss
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
4
 cssDecodeCallback
80.00% covered (warning)
80.00%
8 / 10
0.00% covered (danger)
0.00%
0 / 1
8.51
 fixTagAttributes
85.71% covered (warning)
85.71%
6 / 7
0.00% covered (danger)
0.00%
0 / 1
3.03
 encodeAttribute
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
1
 armorFrenchSpaces
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
1
 safeEncodeAttribute
100.00% covered (success)
100.00%
23 / 23
100.00% covered (success)
100.00%
1 / 1
1
 escapeIdForAttribute
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
3
 escapeIdForLink
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 escapeIdForExternalInterwiki
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 escapeIdInternalUrl
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
2
 escapeIdInternal
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
4
 escapeIdReferenceListInternal
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 escapeClass
0.00% covered (danger)
0.00%
0 / 4
0.00% covered (danger)
0.00%
0 / 1
2
 escapeHtmlAllowEntities
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 decodeTagAttributes
100.00% covered (success)
100.00%
19 / 19
100.00% covered (success)
100.00%
1 / 1
5
 safeEncodeTagAttributes
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
3
 getTagAttributeCallback
88.89% covered (warning)
88.89%
8 / 9
0.00% covered (danger)
0.00%
0 / 1
5.03
 normalizeWhitespace
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
1
 normalizeSectionNameWhitespace
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 normalizeCharReferences
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
1
 normalizeCharReferencesCallback
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
5
 normalizeEntity
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
4
 decCharReference
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
2
 hexCharReference
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
3
 validateCodepoint
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
10
 decodeCharReferences
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
1
 decodeCharReferencesAndNormalize
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
2
 decodeCharReferencesCallback
90.00% covered (success)
90.00%
9 / 10
0.00% covered (danger)
0.00%
0 / 1
5.03
 decodeChar
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
2
 decodeEntity
75.00% covered (warning)
75.00%
3 / 4
0.00% covered (danger)
0.00%
0 / 1
2.06
 attributesAllowedInternal
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 setupAttributesAllowedInternal
2.26% covered (danger)
2.26%
3 / 133
0.00% covered (danger)
0.00%
0 / 1
5.74
 stripAllTags
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
1
 hackDocType
0.00% covered (danger)
0.00%
0 / 11
0.00% covered (danger)
0.00%
0 / 1
20
 cleanUrl
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
4
 validateEmail
91.67% covered (success)
91.67%
11 / 12
0.00% covered (danger)
0.00%
0 / 1
2.00
1<?php
2/**
3 * HTML sanitizer for %MediaWiki.
4 *
5 * Copyright © 2002-2005 Brooke Vibber <bvibber@wikimedia.org> et al
6 * https://www.mediawiki.org/
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 *
23 * @file
24 * @ingroup Parser
25 */
26
27namespace MediaWiki\Parser;
28
29use InvalidArgumentException;
30use LogicException;
31use MediaWiki\HookContainer\HookRunner;
32use MediaWiki\MediaWikiServices;
33use MediaWiki\Tidy\RemexCompatFormatter;
34use StringUtils;
35use UnexpectedValueException;
36use Wikimedia\RemexHtml\HTMLData;
37use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer;
38use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer;
39use Wikimedia\RemexHtml\TreeBuilder\Dispatcher as RemexDispatcher;
40use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder as RemexTreeBuilder;
41
42/**
43 * HTML sanitizer for MediaWiki
44 * @ingroup Parser
45 */
46class Sanitizer {
47    /**
48     * Regular expression to match various types of character references in
49     * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences.
50     * Note that HTML5 allows some named entities to omit the trailing
51     * semicolon; wikitext entities *must* have a trailing semicolon.
52     */
53    private const CHAR_REFS_REGEX =
54        '/&([A-Za-z0-9\x80-\xff]+;)
55        |&\#([0-9]+);
56        |&\#[xX]([0-9A-Fa-f]+);
57        |&/x';
58
59    /**
60     * Acceptable tag name charset from HTML5 parsing spec
61     * https://www.w3.org/TR/html5/syntax.html#tag-open-state
62     */
63    private const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
64
65    /**
66     * Pattern matching evil uris like javascript:
67     * WARNING: DO NOT use this in any place that actually requires denying
68     * certain URIs for security reasons. There are NUMEROUS[1] ways to bypass
69     * pattern-based deny lists; the only way to be secure from javascript:
70     * uri based xss vectors is to allow only things that you know are safe
71     * and deny everything else.
72     * [1]: http://ha.ckers.org/xss.html
73     */
74    private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
75    private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
76
77    /**
78     * Tells escapeIdForAttribute() to encode the ID using the wiki's primary encoding.
79     *
80     * @since 1.30
81     */
82    public const ID_PRIMARY = 0;
83
84    /**
85     * Tells escapeIdForAttribute() to encode the ID using the fallback encoding, or to return false
86     * if no fallback is configured.
87     *
88     * @since 1.30
89     */
90    public const ID_FALLBACK = 1;
91
92    /**
93     * Character entity aliases accepted by MediaWiki in wikitext.
94     * These are not part of the HTML standard.
95     */
96    private const MW_ENTITY_ALIASES = [
97        'רלמ;' => 'rlm;',
98        'رلم;' => 'rlm;',
99    ];
100
101    /**
102     * Lazy-initialised attributes regex, see getAttribsRegex()
103     */
104    private static ?string $attribsRegex = null;
105
106    /**
107     * Regular expression to match HTML/XML attribute pairs within a tag.
108     * Based on https://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
109     * Used in Sanitizer::decodeTagAttributes
110     */
111    private static function getAttribsRegex(): string {
112        if ( self::$attribsRegex !== null ) { // already built by an earlier call
113            return self::$attribsRegex;
114        }
115        $ws = '[\x09\x0a\x0c\x0d\x20]'; // tab, LF, FF, CR or space
116        $nameChar = '[^\x09\x0a\x0c\x0d\x20\/>=]'; // anything but whitespace, "/", ">" and "="
117        self::$attribsRegex =
118                "/((?:{$nameChar}|=){$nameChar}*)
119                    ($ws*=$ws*
120                    (?:
121                        # The attribute value: quoted or alone
122                        \"([^\"]*)(?:\"|\$)
123                        | '([^']*)(?:'|\$)
124                        | (((?!$ws|>).)*)
125                    )
126                )?/sxu";
127        // The pattern is now cached in the static property for every later call.
128        return self::$attribsRegex;
129    }
130
131    /**
132     * Lazy-initialised attribute name regex, see getAttribNameRegex()
133     */
134    private static ?string $attribNameRegex = null;
135
136    /**
137     * Used in Sanitizer::decodeTagAttributes to filter attributes.
138     */
139    private static function getAttribNameRegex(): string {
140        // First character: colon, underscore, letter or number; later ones may also be "." or "-".
141        $head = '[:_\p{L}\p{N}]';
142        $tail = '[:_\.\-\p{L}\p{N}]';
143        // Null-coalescing assignment stores the pattern on the first call only.
144        self::$attribNameRegex ??= '/^(' . $head . $tail . '*)$/sxu';
145        return self::$attribNameRegex;
146    }
147
148    /**
149     * Return the various lists of recognized tags
150     * @param string[] $extratags For any extra tags to include
151     * @param string[] $removetags For any tags (default or extra) to exclude
152     * @return array
153     * @internal
154     */
155    public static function getRecognizedTagData( array $extratags = [], array $removetags = [] ): array {
156        static $commonCase, $staticInitialised = false;