Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
63.28% |
398 / 629 |
|
65.31% |
32 / 49 |
CRAP | |
0.00% |
0 / 1 |
Sanitizer | |
63.38% |
398 / 628 |
|
65.31% |
32 / 49 |
1924.28 | |
0.00% |
0 / 1 |
getAttribsRegex | |
18.18% |
2 / 11 |
|
0.00% |
0 / 1 |
4.19 | |||
getAttribNameRegex | |
40.00% |
2 / 5 |
|
0.00% |
0 / 1 |
2.86 | |||
getRecognizedTagData | |
40.00% |
24 / 60 |
|
0.00% |
0 / 1 |
21.82 | |||
removeHTMLtags | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
internalRemoveHtmlTags | |
96.43% |
27 / 28 |
|
0.00% |
0 / 1 |
12 | |||
removeSomeTags | |
100.00% |
28 / 28 |
|
100.00% |
1 / 1 |
1 | |||
removeHTMLcomments | |
11.76% |
2 / 17 |
|
0.00% |
0 / 1 |
51.96 | |||
validateTag | |
77.78% |
7 / 9 |
|
0.00% |
0 / 1 |
8.70 | |||
validateTagAttributes | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
validateAttributes | |
90.70% |
39 / 43 |
|
0.00% |
0 / 1 |
34.93 | |||
isReservedDataAttribute | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
mergeAttributes | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
6 | |||
normalizeCss | |
55.56% |
10 / 18 |
|
0.00% |
0 / 1 |
5.40 | |||
checkCss | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
4 | |||
cssDecodeCallback | |
80.00% |
8 / 10 |
|
0.00% |
0 / 1 |
8.51 | |||
fixTagAttributes | |
85.71% |
6 / 7 |
|
0.00% |
0 / 1 |
3.03 | |||
encodeAttribute | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 | |||
armorFrenchSpaces | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
safeEncodeAttribute | |
100.00% |
23 / 23 |
|
100.00% |
1 / 1 |
1 | |||
escapeIdForAttribute | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
escapeIdForLink | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
escapeIdForExternalInterwiki | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
escapeIdInternalUrl | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
escapeIdInternal | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
4 | |||
escapeIdReferenceListInternal | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
escapeClass | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
escapeHtmlAllowEntities | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
decodeTagAttributes | |
100.00% |
19 / 19 |
|
100.00% |
1 / 1 |
5 | |||
safeEncodeTagAttributes | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
getTagAttributeCallback | |
88.89% |
8 / 9 |
|
0.00% |
0 / 1 |
5.03 | |||
normalizeWhitespace | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
normalizeSectionNameWhitespace | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
normalizeCharReferences | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
normalizeCharReferencesCallback | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
5 | |||
normalizeEntity | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
4 | |||
decCharReference | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
hexCharReference | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
validateCodepoint | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
10 | |||
decodeCharReferences | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
decodeCharReferencesAndNormalize | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
2 | |||
decodeCharReferencesCallback | |
90.00% |
9 / 10 |
|
0.00% |
0 / 1 |
5.03 | |||
decodeChar | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
decodeEntity | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
2.06 | |||
attributesAllowedInternal | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
setupAttributesAllowedInternal | |
2.26% |
3 / 133 |
|
0.00% |
0 / 1 |
5.74 | |||
stripAllTags | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
1 | |||
hackDocType | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
20 | |||
cleanUrl | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
4 | |||
validateEmail | |
91.67% |
11 / 12 |
|
0.00% |
0 / 1 |
2.00 |
1 | <?php |
2 | /** |
3 | * HTML sanitizer for %MediaWiki. |
4 | * |
5 | * Copyright © 2002-2005 Brooke Vibber <bvibber@wikimedia.org> et al |
6 | * https://www.mediawiki.org/ |
7 | * |
8 | * This program is free software; you can redistribute it and/or modify |
9 | * it under the terms of the GNU General Public License as published by |
10 | * the Free Software Foundation; either version 2 of the License, or |
11 | * (at your option) any later version. |
12 | * |
13 | * This program is distributed in the hope that it will be useful, |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | * GNU General Public License for more details. |
17 | * |
18 | * You should have received a copy of the GNU General Public License along |
19 | * with this program; if not, write to the Free Software Foundation, Inc., |
20 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
21 | * http://www.gnu.org/copyleft/gpl.html |
22 | * |
23 | * @file |
24 | * @ingroup Parser |
25 | */ |
26 | |
27 | namespace MediaWiki\Parser; |
28 | |
29 | use InvalidArgumentException; |
30 | use LogicException; |
31 | use MediaWiki\HookContainer\HookRunner; |
32 | use MediaWiki\MediaWikiServices; |
33 | use MediaWiki\Tidy\RemexCompatFormatter; |
34 | use StringUtils; |
35 | use UnexpectedValueException; |
36 | use Wikimedia\RemexHtml\HTMLData; |
37 | use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer; |
38 | use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer; |
39 | use Wikimedia\RemexHtml\TreeBuilder\Dispatcher as RemexDispatcher; |
40 | use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder as RemexTreeBuilder; |
41 | |
42 | /** |
43 | * HTML sanitizer for MediaWiki |
44 | * @ingroup Parser |
45 | */ |
46 | class Sanitizer { |
47 | /** |
48 | * Regular expression to match various types of character references in |
49 | * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences. |
50 | * Note that HTML5 allows some named entities to omit the trailing |
51 | * semicolon; wikitext entities *must* have a trailing semicolon. |
52 | */ |
53 | private const CHAR_REFS_REGEX = |
54 | '/&([A-Za-z0-9\x80-\xff]+;) |
55 | |&\#([0-9]+); |
56 | |&\#[xX]([0-9A-Fa-f]+); |
57 | |&/x'; |
58 | |
59 | /** |
60 | * Acceptable tag name charset from HTML5 parsing spec |
61 | * https://www.w3.org/TR/html5/syntax.html#tag-open-state |
62 | */ |
63 | private const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!'; |
64 | |
65 | /** |
66 | * Pattern matching evil uris like javascript: |
67 | * WARNING: DO NOT use this in any place that actually requires denying |
68 | * certain URIs for security reasons. There are NUMEROUS[1] ways to bypass |
69 | * pattern-based deny lists; the only way to be secure from javascript: |
70 | * uri based xss vectors is to allow only things that you know are safe |
71 | * and deny everything else. |
72 | * [1]: http://ha.ckers.org/xss.html |
73 | */ |
74 | private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i'; |
75 | private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/"; |
76 | |
77 | /** |
78 | * Tells escapeIdForAttribute() to encode the ID using the wiki's primary encoding. |
79 | * |
80 | * @since 1.30 |
81 | */ |
82 | public const ID_PRIMARY = 0; |
83 | |
84 | /** |
85 | * Tells escapeIdForAttribute() to encode the ID using the fallback encoding, or return false |
86 | * if no fallback is configured. |
87 | * |
88 | * @since 1.30 |
89 | */ |
90 | public const ID_FALLBACK = 1; |
91 | |
92 | /** |
93 | * Character entity aliases accepted by MediaWiki in wikitext. |
94 | * These are not part of the HTML standard. |
95 | */ |
96 | private const MW_ENTITY_ALIASES = [ |
97 | 'רלמ;' => 'rlm;', |
98 | 'رلم;' => 'rlm;', |
99 | ]; |
100 | |
101 | /** |
102 | * Lazy-initialised attributes regex, see getAttribsRegex() |
103 | */ |
104 | private static ?string $attribsRegex = null; |
105 | |
106 | /** |
107 | * Regular expression to match HTML/XML attribute pairs within a tag. |
108 | * Based on https://www.w3.org/TR/html5/syntax.html#before-attribute-name-state |
109 | * Used in Sanitizer::decodeTagAttributes |
110 | */ |
private static function getAttribsRegex(): string {
	// Already built during this request: reuse the cached pattern.
	if ( self::$attribsRegex !== null ) {
		return self::$attribsRegex;
	}

	// Building blocks: whitespace characters, a character allowed inside
	// an attribute name, and the first character of a name (which may
	// additionally be "=").
	$wsChars = '\x09\x0a\x0c\x0d\x20';
	$ws = "[{$wsChars}]";
	$nameChar = "[^{$wsChars}\/>=]";
	$nameStart = "(?:{$nameChar}|=)";

	// Capture groups: 1 = attribute name, 2 = the optional "= value" part,
	// 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value.
	self::$attribsRegex =
		"/({$nameStart}{$nameChar}*)
			($ws*=$ws*
			(?:
				# The attribute value: quoted or alone
				\"([^\"]*)(?:\"|\$)
				| '([^']*)(?:'|\$)
				| (((?!$ws|>).)*)
			)
		)?/sxu";

	return self::$attribsRegex;
}
130 | |
131 | /** |
132 | * Lazy-initialised attribute name regex, see getAttribNameRegex() |
133 | */ |
134 | private static ?string $attribNameRegex = null; |
135 | |
136 | /** |
137 | * Used in Sanitizer::decodeTagAttributes to filter attributes. |
138 | */ |
private static function getAttribNameRegex(): string {
	// Reuse the pattern if an earlier call already assembled it.
	if ( self::$attribNameRegex !== null ) {
		return self::$attribNameRegex;
	}

	// A name starts with ":", "_", a letter or a number, and may continue
	// with those characters plus "." and "-".
	$nameStart = "[:_\p{L}\p{N}]";
	$nameChar = "[:_\.\-\p{L}\p{N}]";
	self::$attribNameRegex = "/^({$nameStart}{$nameChar}*)$/sxu";

	return self::$attribNameRegex;
}
147 | |
148 | /** |
149 | * Return the various lists of recognized tags |
150 | * @param string[] $extratags For any extra tags to include |
151 | * @param string[] $removetags For any tags (default or extra) to exclude |
152 | * @return array |
153 | * @internal |
154 | */ |
public static function getRecognizedTagData( array $extratags = [], array $removetags = [] ): array {
	// $baseSets holds the built-in tag sets; $defaultResult holds the
	// answer for a call without extra or removed tags. Both are built at
	// most once and then reused by later calls.
	static $baseSets = null;
	static $defaultResult = null;

	$isDefaultRequest = ( $extratags === [] && $removetags === [] );
	if ( $isDefaultRequest && $defaultResult ) {
		return $defaultResult;
	}

	if ( $baseSets === null ) {
		// Tags that must be closed
		$pairs = [
			'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
			'kbd', 'samp', 'data', 'time', 'mark'
		];

		// Tags that can be self-closed. For tags not also listed in
		// $singleOnly, a self-closed tag is emitted as an empty element
		// (open-tag/close-tag pair).
		$single = [
			'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
		];

		// Elements that cannot have close tags. This is (not coincidentally)
		// also the list of tags for which the HTML 5 parsing algorithm
		// requires you to "acknowledge the token's self-closing flag", i.e.
		// a self-closing tag like <br/> is not an HTML 5 parse error only
		// for this list.
		$singleOnly = [
			'br', 'wbr', 'hr', 'meta', 'link'
		];

		// Tags that can be nested
		$nest = [
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
			'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
		];

		// Tags that can only appear inside a table
		$tableParts = [
			'td', 'th', 'tr',
		];

		$lists = [
			'htmlpairs' => $pairs,
			'htmlsingle' => $single,
			'htmlsingleonly' => $singleOnly,
			'htmlnest' => $nest,
			'tabletags' => $tableParts,
			// Tags used by lists
			'htmllist' => [ 'ul', 'ol' ],
			// Tags that can appear in a list
			'listtags' => [ 'li' ],
			'htmlsingleallowed' => array_unique( array_merge( $single, $tableParts ) ),
			'htmlelements' => array_unique( array_merge( $single, $pairs, $nest ) ),
		];

		// Turn every list into a "tag name => true" map so that callers
		// can test membership with isset().
		$baseSets = array_map(
			static function ( array $names ): array {
				return array_fill_keys( $names, true );
			},
			$lists
		);
	}

	// Apply the caller's additions and removals. Only 'htmlpairs' and
	// 'htmlelements' are affected; the other sets are returned as built.
	$extraSet = array_fill_keys( $extratags, true );
	$removeSet = array_fill_keys( $removetags, true );

	$result = $baseSets;
	$result['htmlpairs'] = array_merge( $extraSet, $baseSets['htmlpairs'] );
	$result['htmlelements'] = array_diff_key(
		array_merge( $extraSet, $baseSets['htmlelements'] ),
		$removeSet
	);

	if ( $isDefaultRequest ) {
		$defaultResult = $result;
	}
	return $result;
}
239 | |
240 | /** |
241 | * Cleans up HTML, removes dangerous tags and attributes, and |
242 | * removes HTML comments; BEWARE there may be unmatched HTML |
243 | * tags in the result. |
244 | * |
245 | * @note Callers are recommended to use `::removeSomeTags()` |
246 | * instead of this method. `Sanitizer::removeSomeTags()` is safer |
247 | * and will always return well-formed HTML; however, it is |
248 | * significantly slower (especially for short strings where setup |
249 | * costs predominate). This method, although faster, should only |
250 | * be used where we know the result will be cleaned up in a subsequent |
251 | * tidy pass. |
252 | * |
253 | * @param string $text Original string; see T268353 for why untainted. |
254 | * @param-taint $text none |
255 | * @param callable|null $processCallback Callback to do any variable or |
256 | * parameter replacements in HTML attribute values. |
257 | * This argument should be considered @internal. |
258 | * @param-taint $processCallback exec_shell |
259 | * @param array|bool $args Arguments for the processing callback |
260 | * @param-taint $args none |
261 | * @param array $extratags For any extra tags to include |
262 | * @param-taint $extratags tainted |
263 | * @param array $removetags For any tags (default or extra) to exclude |
264 | * @param-taint $removetags none |
265 | * @return string |
266 | * @return-taint escaped |
267 | * @deprecated since 1.38. Use ::removeSomeTags(), which always gives |
268 | * balanced/tidy HTML. |
269 | */ |
public static function removeHTMLtags( string $text, ?callable $processCallback = null,
	$args = [], array $extratags = [], array $removetags = []
): string {
	// Deprecated public entry point: report the deprecation, then hand
	// every argument unchanged to the internal implementation.
	wfDeprecated( __METHOD__, '1.38' );

	$cleaned = self::internalRemoveHtmlTags( $text, $processCallback, $args, $extratags, $removetags );
	return $cleaned;
}
278 | |
279 | /** |
280 | * Cleans up HTML, removes dangerous tags and attributes, and |
281 | * removes HTML comments; BEWARE there may be unmatched HTML |
282 | * tags in the result. |
283 | * |
284 | * @note Callers are recommended to use `::removeSomeTags()` instead |
285 | * of this method. `Sanitizer::removeSomeTags()` is safer and will |
286 | * always return well-formed HTML; however, it is significantly |
287 | * slower (especially for short strings where setup costs |
288 | * predominate). This method is for internal use by the legacy parser |
289 | * where we know the result will be cleaned up in a subsequent tidy pass. |
290 | * |
291 | * @param string $text Original string; see T268353 for why untainted. |
292 | * @param-taint $text none |
293 | * @param callable|null $processCallback Callback to do any variable or |
294 | * parameter replacements in HTML attribute values. |
295 | * This argument should be considered @internal. |
296 | * @param-taint $processCallback exec_shell |
297 | * @param array|bool $args Arguments for the processing callback |
298 | * @param-taint $args none |
299 | * @param array $extratags For any extra tags to include |
300 | * @param-taint $extratags tainted |
301 | * @param array $removetags For any tags (default or extra) to exclude |
302 | * @param-taint $removetags none |
303 | * @return string |
304 | * @return-taint escaped |
305 | * @internal |
306 | */ |
public static function internalRemoveHtmlTags( string $text, ?callable $processCallback = null,
	$args = [], array $extratags = [], array $removetags = []
): string {
	$tagData = self::getRecognizedTagData( $extratags, $removetags );
	$htmlsingle = $tagData['htmlsingle'];
	$htmlsingleonly = $tagData['htmlsingleonly'];
	$htmlelements = $tagData['htmlelements'];

	# Remove HTML comments
	$text = self::removeHTMLcomments( $text );

	# Split on '<': the first piece is plain text, every later piece
	# starts right after a '<' and is a candidate tag.
	$bits = explode( '<', $text );

	# A '>' outside of a recognised tag is plain text and must be escaped.
	$text = str_replace( '>', '&gt;', array_shift( $bits ) );

	# this might be possible using remex tidy itself
	foreach ( $bits as $x ) {
		if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
			[ /* $qbar */, $slash, $t, $params, $brace, $rest ] = $regs;

			$badtag = false;
			$t = strtolower( $t );
			if ( isset( $htmlelements[$t] ) ) {
				if ( is_callable( $processCallback ) ) {
					# The callback receives the raw attribute text by reference.
					call_user_func_array( $processCallback, [ &$params, $args ] );
				}

				if ( $brace == '/>' && !( isset( $htmlsingle[$t] ) || isset( $htmlsingleonly[$t] ) ) ) {
					// Remove the self-closing slash, to be consistent
					// with HTML5 semantics. T134423
					$brace = '>';
				}
				if ( !self::validateTag( $params, $t ) ) {
					$badtag = true;
				}

				$newparams = self::fixTagAttributes( $params, $t );
				if ( !$badtag ) {
					if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) {
						# Interpret self-closing tags as empty tags even when
						# HTML 5 would interpret them as start tags. Such input
						# is commonly seen on Wikimedia wikis with this intention.
						$brace = "></$t>";
					}

					# Text following the tag: escape any stray '>'.
					$rest = str_replace( '>', '&gt;', $rest );
					$text .= "<$slash$t$newparams$brace$rest";
					continue;
				}
			}
		}

		# Not a recognised, valid tag: emit the whole piece as escaped
		# text, so that neither the '<' that introduced it nor any '>'
		# inside it can act as markup.
		$text .= '&lt;' . str_replace( '>', '&gt;', $x );
	}
	return $text;
}
360 | |
361 | /** |
362 | * Cleans up HTML, removes dangerous tags and attributes, and |
363 | * removes HTML comments; the result will always be balanced and |
364 | * tidy HTML. |
365 | * @param string $text Source string; see T268353 for why untainted |
366 | * @param-taint $text none |
367 | * @param array $options Options controlling the cleanup: |
368 | * string[] $options['extraTags'] Any extra tags to allow |
369 | * (This property taints the whole array.) |
370 | * string[] $options['removeTags'] Any tags (default or extra) to exclude |
371 | * callable(Attributes,...):Attributes $options['attrCallback'] Callback |
372 | * to do any variable or parameter replacements in HTML attribute |
373 | * values before further cleanup; should be considered @internal |
374 | * and not for external use. |
375 | * array $options['attrCallbackArgs'] Additional arguments for the |
376 | * attribute callback |
377 | * @param-taint $options tainted |
378 | * @return string The cleaned up HTML |
379 | * @return-taint escaped |
380 | * @since 1.38 |
381 | */ |
public static function removeSomeTags(
	string $text, array $options = []
): string {
	$allowExtra = $options['extraTags'] ?? [];
	$disallow = $options['removeTags'] ?? [];
	// The two attribute-callback options are @internal.
	$attrCallback = $options['attrCallback'] ?? null;
	$attrCallbackArgs = $options['attrCallbackArgs'] ?? [];

	// This disallows HTML5-style "missing trailing semicolon" character
	// references: in wikitext "clean&copy" does *not* contain an entity.
	$text = self::normalizeCharReferences( $text );

	$tagData = self::getRecognizedTagData( $allowExtra, $disallow );

	// Build the RemexHtml pipeline from the output end backwards:
	// tokenizer -> tag remover -> dispatcher -> tree builder -> serializer.
	$serializer = new RemexSerializer( new RemexCompatFormatter );
	$treeBuilder = new RemexTreeBuilder( $serializer, [
		'ignoreErrors' => true,
		'ignoreNulls' => true,
	] );
	$dispatcher = new RemexDispatcher( $treeBuilder );
	$tagRemover = new RemexRemoveTagHandler(
		$dispatcher, $text, $tagData,
		$attrCallback, $attrCallbackArgs
	);
	$tokenizer = new RemexTokenizer( $tagRemover, $text, [
		'ignoreErrors' => true,
		// don't ignore char refs, we want them to be decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	] );

	// Tokenize $text as a fragment of an HTML <body>.
	$tokenizer->execute( [
		'fragmentNamespace' => HTMLData::NS_HTML,
		'fragmentName' => 'body',
	] );

	return $serializer->getResult();
}
421 | |
422 | /** |
423 | * Remove '<!--', '-->', and everything between. |
424 | * To avoid leaving blank lines, when a comment is both preceded |
425 | * and followed by a newline (ignoring spaces), trim leading and |
426 | * trailing spaces and one of the newlines. |
427 | */ |
public static function removeHTMLcomments( string $text ): string {
	for ( ; ; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			// No comment left
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			// Unterminated comment: leave the remainder untouched
			break;
		}
		// Offset of the first character after the closing '-->'
		$after = $close + 3;

		// Walk outwards over the spaces surrounding the comment:
		// $left ends up on the first non-space character before it (or
		// at offset 0), $right on the first non-space character after it.
		$left = max( $open - 1, 0 );
		while ( $left > 0 && substr( $text, $left, 1 ) === ' ' ) {
			$left--;
		}
		$right = $after;
		while ( substr( $text, $right, 1 ) === ' ' ) {
			$right++;
		}

		$onOwnLine = substr( $text, $left, 1 ) === "\n"
			&& substr( $text, $right, 1 ) === "\n";
		if ( $onOwnLine ) {
			// Drop the comment, the surrounding spaces and both
			// newlines, then put a single newline back.
			$text = substr_replace( $text, "\n", $left, $right - $left + 1 );
		} else {
			// Drop only the comment itself.
			$text = substr_replace( $text, '', $open, $after - $open );
		}
	}
	return $text;
}
461 | |
462 | /** |
463 | * Takes attribute names and values for a tag and the tag name and |
464 | * validates that the tag is allowed to be present. |
465 | * This DOES NOT validate the attributes, nor does it validate the |
466 | * tags themselves. This method only handles the special circumstances |
467 | * where we may want to allow a tag within content but ONLY when it has |
468 | * specific attributes set. |
469 | * |
470 | * @see RemexRemoveTagHandler::validateTag() |
471 | */ |
private static function validateTag( string $params, string $element ): bool {
	$attribs = self::decodeTagAttributes( $params );

	switch ( $element ) {
		case 'meta':
			// <meta> is only valid or safe in content with an itemprop="",
			// and needs a content="" to go with that itemprop.
			return isset( $attribs['itemprop'] ) && isset( $attribs['content'] );

		case 'link':
			// <link> is only valid or safe in content with an itemprop="",
			// and needs an associated href="".
			return isset( $attribs['itemprop'] ) && isset( $attribs['href'] );

		default:
			// No attribute requirements for any other element.
			return true;
	}
}
492 | |
493 | /** |
494 | * Take an array of attribute names and values and normalize or discard |
495 | * illegal values for the given element type. |
496 | * |
497 | * - Discards attributes not allowed for the given element |
498 | * - Unsafe style attributes are discarded |
499 | * - Invalid id attributes are re-encoded |
500 | * |
501 | * @todo Check for legal values where the DTD limits things. |
502 | * @todo Check for unique id attribute :P |
503 | */ |
public static function validateTagAttributes( array $attribs, string $element ): array {
	// Look up the attribute names allowed on this element, then filter
	// and normalize the given attributes against that list.
	$allowedForElement = self::attributesAllowedInternal( $element );

	return self::validateAttributes( $attribs, $allowedForElement );
}
508 | |
509 | /** |
510 | * Take an array of attribute names and values and normalize or discard |
511 | * illegal values. |
512 | * |
513 | * - Discards attributes not on the given list |
514 | * - Unsafe style attributes are discarded |
515 | * - Invalid id attributes are re-encoded |
516 | * |
517 | * @param array $attribs |
518 | * @param array $allowed List of allowed attribute names, |
519 | * as an associative array where keys give valid attribute names |
520 | * (since 1.34). Before 1.35, passing a sequential array of |
521 | * valid attribute names was permitted but that is now deprecated. |
522 | * @return array |
523 | * |
524 | * @todo Check for legal values where the DTD limits things. |
525 | * @todo Check for unique id attribute :P |
526 | */ |
public static function validateAttributes( array $attribs, array $allowed ): array {
	if ( isset( $allowed[0] ) ) {
		// A sequential list of names is deprecated; convert it to the
		// "name => true" form used below.
		wfDeprecated( __METHOD__ . ' with sequential array', '1.35' );
		$allowed = array_fill_keys( $allowed, true );
	}
	$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	// Attributes whose value is a list of element ids
	$idRefListAttribs = [ 'aria-describedby', 'aria-flowto', 'aria-labelledby', 'aria-owns' ];
	// RDFa and microdata properties allow URLs, URIs and/or CURIs.
	$uriLikeAttribs = [
		'rel', 'rev',
		# RDFa
		'about', 'property', 'resource', 'datatype', 'typeof',
		# HTML5 microdata
		'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',
	];
	// Attributes that must start with an allowed URL protocol
	$urlAttribs = [ 'href', 'src', 'poster' ];

	$out = [];
	foreach ( $attribs as $name => $val ) {
		# Allow XML namespace declaration to allow RDFa
		if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $name ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $val ) ) {
				$out[$name] = $val;
			}
			continue;
		}

		# Any attribute beginning with "data-" is acceptable as long as
		# it contains no colon (i.e. is not namespaced); everything else
		# must be on the allowed list. Data attributes reserved for
		# MediaWiki code are always rejected.
		$permitted = preg_match( '/^data-[^:]*$/i', $name )
			|| array_key_exists( $name, $allowed );
		if ( !$permitted || self::isReservedDataAttribute( $name ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# https://msdn.microsoft.com/en-us/library/ms537634.aspx
		if ( $name == 'style' ) {
			$val = self::checkCss( $val );
		}

		# Escape HTML id attributes
		if ( $name === 'id' ) {
			$val = self::escapeIdForAttribute( $val, self::ID_PRIMARY );
		}

		# Escape HTML id reference lists
		if ( in_array( $name, $idRefListAttribs, true ) ) {
			$val = self::escapeIdReferenceListInternal( $val );
		}

		// Paranoia. Allow "simple" values but suppress javascript
		if ( in_array( $name, $uriLikeAttribs, true )
			&& preg_match( self::EVIL_URI_PATTERN, $val )
		) {
			continue;
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		# validation code that can be used by tag hook handlers, etc.
		# Values not using an allowed protocol are dropped; this also
		# drops all relative URLs.
		if ( in_array( $name, $urlAttribs, true ) && !preg_match( $hrefExp, $val ) ) {
			continue;
		}

		// Only allow tabindex of 0, which is useful for accessibility.
		if ( $name === 'tabindex' && $val !== '0' ) {
			continue;
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$out[$name] = $val;
	}

	# itemtype, itemid, itemref don't make sense without itemscope
	if ( !array_key_exists( 'itemscope', $out ) ) {
		unset( $out['itemtype'], $out['itemid'], $out['itemref'] );
	}
	# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

	return $out;
}
625 | |
626 | /** |
627 | * Given an attribute name, checks whether it is a reserved data attribute |
628 | * (such as data-mw-foo) which is unavailable to user-generated HTML so MediaWiki |
629 | * core and extension code can safely use it to communicate with frontend code. |
630 | * @param string $attr Attribute name. |
631 | * @return bool |
632 | */ |
public static function isReservedDataAttribute( string $attr ): bool {
	// Reserved prefixes (matched case-insensitively):
	//  - data-ooui is reserved for ooui.
	//  - data-mw and data-parsoid are reserved for parsoid.
	//  - data-mw-<name here> is reserved for extensions (or core) that
	//    need to communicate some data to the client and want to be sure
	//    that it isn't coming from an untrusted user.
	// The possibility of namespaces is ignored, since user-generated HTML
	// can't use them anymore.
	$reservedPattern = '/^data-(ooui|mw|parsoid)/i';

	return preg_match( $reservedPattern, $attr ) === 1;
}
643 | |
644 | /** |
645 | * Merge two sets of HTML attributes. Conflicting items in the second set |
646 | * will override those in the first, except for 'class' attributes which |
647 | * will be combined (if they're both strings). |
648 | * |
649 | * @todo implement merging for other attributes such as style |
650 | */ |
public static function mergeAttributes( array $a, array $b ): array {
	// Start from a plain merge, in which entries of $b win.
	$merged = array_merge( $a, $b );

	$classA = $a['class'] ?? null;
	$classB = $b['class'] ?? null;

	// Two different string class lists are combined instead of
	// letting $b replace $a.
	if ( is_string( $classA ) && is_string( $classB ) && $classA !== $classB ) {
		$names = preg_split( '/\s+/', "{$classA} {$classB}", -1, PREG_SPLIT_NO_EMPTY );
		$merged['class'] = implode( ' ', array_unique( $names ) );
	}

	return $merged;
}
663 | |
664 | /** |
665 | * Normalize CSS into a format we can easily search for hostile input |
666 | * - decode character references |
667 | * - decode escape sequences |
668 | * - remove comments, unless the entire value is one single comment |
669 | * @param string $value the css string |
670 | * @return string normalized css |
671 | */ |
public static function normalizeCss( string $value ): string {
	// Step 1: decode character references like &#123;
	$value = self::decodeCharReferences( $value );

	// Step 2: decode escape sequences and line continuations.
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references, which
	// means this function cannot return unsanitized escape sequences.
	// It is possible to manufacture input that contains character
	// references that decode to escape sequences that decode to
	// character references, but it's OK for the return value to contain
	// character references because the caller is supposed to escape
	// those anyway.
	static $decodeRegex = null;
	if ( $decodeRegex === null ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$decodeRegex = "/ $backslash
			(?:
				($nl) | # 1. Line continuation
				([0-9A-Fa-f]{1,6})$space? | # 2. character number
				(.) | # 3. backslash cancelling special meaning
				() | # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback(
		$decodeRegex,
		[ __CLASS__, 'cssDecodeCallback' ],
		$value
	);

	// A value that is nothing but one single comment is let through
	// as-is, to allow other functions which may reject it to pass some
	// error message through.
	if ( preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
		return $value;
	}

	// Step 3: remove any comments; IE gets token splitting wrong.
	// This must be done AFTER decoding character references and escape
	// sequences, because those steps can introduce comments. This step
	// cannot introduce character references or escape sequences,
	// because it replaces comments with spaces rather than removing
	// them completely.
	$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

	// Remove anything after a comment-start token that is still
	// present, to guard against incorrect client implementations.
	$commentPos = strpos( $value, '/*' );
	return $commentPos === false
		? $value
		: substr( $value, 0, $commentPos );
}
723 | |
/**
 * Pick apart some CSS and check it for forbidden or unsafe structures.
 * Returns a sanitized string: the value as produced by normalizeCss(), or,
 * if the input is rejected, only a comment naming the reason.
 *
 * Rejected are control characters, the UTF-8 replacement character, and
 * the constructs 'expression', 'accelerator:', '-o-link:',
 * '-o-link-source:', '-o-replace:', 'url(', 'image(', 'image-set(' and
 * 'attr(' with a later 'url' argument (all matched case-insensitively).
 *
 * NOTE: Despite the fact that character references are decoded, the
 * returned string may contain character references given certain
 * clever input strings. These character references must
 * be escaped before the return value is embedded in HTML.
 *
 * @param string $value
 * @return string
 */
public static function checkCss( $value ) {
	$value = self::normalizeCss( $value );

	// Control characters and the replacement character are never allowed
	$hasControlChar = preg_match( '/[\000-\010\013\016-\037\177]/', $value )
		|| strpos( $value, \UtfNormal\Constants::UTF8_REPLACEMENT ) !== false;
	if ( $hasControlChar ) {
		return '/* invalid control char */';
	}

	// Problematic keywords and functions
	$hasInsecureConstruct = preg_match(
		'! expression
			| accelerator\s*:
			| -o-link\s*:
			| -o-link-source\s*:
			| -o-replace\s*:
			| url\s*\(
			| image\s*\(
			| image-set\s*\(
			| attr\s*\([^)]+[\s,]+url
		!ix', $value );
	if ( $hasInsecureConstruct ) {
		return '/* insecure input */';
	}

	return $value;
}
764 | |
/**
 * Callback used by normalizeCss() for each backslash sequence matched by
 * its decode regex.
 *
 * @param array $matches Regex groups: 1 = line continuation, 2 = hex
 *  character number, 3 = single escaped character; none of them set to a
 *  non-empty string means a backslash at the end of the string
 * @return string Replacement text
 */
private static function cssDecodeCallback( array $matches ): string {
	if ( $matches[1] !== '' ) {
		// Line continuation: drop it entirely
		return '';
	}

	if ( $matches[2] !== '' ) {
		# hexdec could return a float if the match is too long, but the
		# regexp in question limits the string length to 6.
		$decoded = \UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$decoded = $matches[3];
	} else {
		$decoded = '\\';
	}

	// Newline, quotes and backslash need to stay escaped inside strings;
	// everything else was an unnecessary escape and is emitted decoded.
	$mustStayEscaped = $decoded == "\n" || $decoded == '"'
		|| $decoded == "'" || $decoded == '\\';
	if ( !$mustStayEscaped ) {
		return $decoded;
	}

	// Emit a clean escape sequence to avoid parsing errors by clients
	return '\\' . dechex( ord( $decoded ) ) . ' ';
}
787 | |
/**
 * Take a tag soup fragment listing an HTML element's attributes
 * and normalize it to well-formed XML, discarding unwanted attributes.
 * Output is safe for further wikitext processing, with escaping of
 * values that could trigger problems.
 *
 * - Normalizes attribute names to lowercase
 * - Discards attributes not allowed for the given element
 * - Turns broken or invalid entities into plaintext
 * - Double-quotes all attribute values
 * - Attributes without values are given the name as attribute
 * - Double attributes are discarded
 * - Unsafe style attributes are discarded
 * - Prepends space if there are attributes.
 * - (Optionally) Sorts attributes by name.
 *
 * The work is delegated to decodeTagAttributes(), validateTagAttributes()
 * and safeEncodeTagAttributes(), in that order.
 *
 * @param string $text
 * @param string $element
 * @param bool $sorted Whether to sort the attributes (default: false)
 * @return string Empty string when $text is blank
 */
public static function fixTagAttributes( string $text, string $element, bool $sorted = false ): string {
	if ( trim( $text ) === '' ) {
		return '';
	}

	$attribs = self::validateTagAttributes(
		self::decodeTagAttributes( $text ),
		$element
	);
	if ( $sorted ) {
		ksort( $attribs );
	}

	return self::safeEncodeTagAttributes( $attribs );
}
823 | |
/**
 * Encode an attribute value for HTML output.
 *
 * Besides the htmlspecialchars() escaping (quotes included), newline,
 * carriage return and tab are turned into numeric character references.
 *
 * @param string $text
 * @param-taint $text escapes_html
 * @return string HTML-encoded text fragment
 * @return-taint escaped
 */
public static function encodeAttribute( string $text ): string {
	$encValue = htmlspecialchars( $text, ENT_QUOTES );

	// Whitespace is normalized during attribute decoding,
	// so if we've been passed non-spaces we must encode them
	// ahead of time or they won't be preserved.
	$encValue = strtr( $encValue, [
		"\n" => '&#10;',
		"\r" => '&#13;',
		"\t" => '&#9;',
	] );

	return $encValue;
}
845 | |
/**
 * Armor French spaces with a replacement character
 *
 * A space is replaced when it directly precedes one of ? : ; ! % » ›
 * (provided that character is not followed by a word character), or when
 * it directly follows « or ‹.
 *
 * @since 1.32
 * @param string $text Text to armor
 * @param string $space Space character for the French spaces, defaults to '&#160;'
 * @return string Armored text
 */
public static function armorFrenchSpaces( string $text, string $space = '&#160;' ): string {
	// Replace $ with \$ and \ with \\ so that preg_replace() treats the
	// replacement text literally
	$space = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );
	$fixtags = [
		# French spaces, last one Guillemet-left
		# only if it isn't followed by a word character.
		'/ (?=[?:;!%»›](?!\w))/u' => $space,
		# French spaces, Guillemet-right
		# ${1} rather than \1: a replacement starting with a digit would
		# otherwise be read as part of the group number.
		'/([«‹]) /u' => '${1}' . $space,
	];
	return preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
}
866 | |
/**
 * Encode an attribute value for HTML tags, with extra armoring
 * against further wiki processing.
 *
 * On top of encodeAttribute(), characters and keywords that later
 * wikitext parsing could act on are replaced with numeric character
 * references, and the colon of anything matching wfUrlProtocols() is
 * encoded as well.
 *
 * @param string $text
 * @param-taint $text escapes_html
 * @return string HTML-encoded text fragment
 * @return-taint escaped
 */
public static function safeEncodeAttribute( string $text ): string {
	$encValue = self::encodeAttribute( $text );

	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$encValue = strtr( $encValue, [
		// '<', '>', and '"' should never happen, as they indicate that we've received invalid input which should
		// have been escaped.
		'<' => '&lt;',
		'>' => '&gt;',
		'"' => '&quot;',
		'{' => '&#123;',
		'}' => '&#125;', // prevent unpaired language conversion syntax
		'[' => '&#91;',
		']' => '&#93;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	] );

	# Stupid hack: break up URL protocols by encoding their colon
	$encValue = preg_replace_callback(
		'/((?i)' . wfUrlProtocols() . ')/',
		static function ( $matches ) {
			return str_replace( ':', '&#58;', $matches[1] );
		},
		$encValue );
	return $encValue;
}
907 | |
/**
 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 * a valid HTML id attribute.
 *
 * The escaping mode is looked up in the global $wgFragmentMode under the key $mode.
 *
 * WARNING: The output of this function is not guaranteed to be HTML safe, so be sure to use
 * proper escaping.
 *
 * @param string $id String to escape
 * @param int $mode One of ID_* constants, specifying whether the primary or fallback encoding
 *     should be used.
 * @return string|false Escaped ID or false if fallback encoding is requested but it's not
 *     configured.
 * @throws UnexpectedValueException If the primary mode is requested but not configured
 *
 * @since 1.30
 */
public static function escapeIdForAttribute( string $id, int $mode = self::ID_PRIMARY ) {
	global $wgFragmentMode;

	if ( isset( $wgFragmentMode[$mode] ) ) {
		return self::escapeIdInternal( $id, $wgFragmentMode[$mode] );
	}

	// Not configured: only a missing primary mode is an error
	if ( $mode === self::ID_PRIMARY ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return false;
}
937 | |
/**
 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 * a valid URL fragment, using the primary mode from the global $wgFragmentMode.
 *
 * WARNING: The output of this function is not guaranteed to be HTML safe, so be sure to use
 * proper escaping.
 *
 * @param string $id String to escape
 * @return string Escaped ID
 * @throws UnexpectedValueException If no primary mode is configured
 *
 * @since 1.30
 */
public static function escapeIdForLink( string $id ): string {
	global $wgFragmentMode;

	$primaryMode = $wgFragmentMode[self::ID_PRIMARY] ?? null;
	if ( $primaryMode === null ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return self::escapeIdInternalUrl( $id, $primaryMode );
}
963 | |
/**
 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 * a valid URL fragment for external interwikis. The escaping mode is taken from the global
 * $wgExternalInterwikiFragmentMode.
 *
 * @param string $id String to escape
 * @return string Escaped ID
 *
 * @since 1.30
 */
public static function escapeIdForExternalInterwiki( string $id ): string {
	global $wgExternalInterwikiFragmentMode;

	return self::escapeIdInternalUrl( $id, $wgExternalInterwikiFragmentMode );
}
980 | |
/**
 * Escape an id via escapeIdInternal() and, in 'html5' mode only, additionally
 * percent-encode percent signs that start a two-digit hex sequence. This is
 * meant for href (but not id) attributes.
 *
 * @since 1.35
 * @see https://phabricator.wikimedia.org/T238385
 * @param string $id String to escape
 * @param string $mode One of modes from $wgFragmentMode
 * @return string
 */
private static function escapeIdInternalUrl( string $id, string $mode ): string {
	$escaped = self::escapeIdInternal( $id, $mode );
	if ( $mode !== 'html5' ) {
		return $escaped;
	}

	return preg_replace( '/%([a-fA-F0-9]{2})/', '%25$1', $escaped );
}
997 | |
/**
 * Helper for escapeIdFor*() functions. Performs most of the actual escaping.
 *
 * - 'html5': tab, LF, FF, CR and space each become an underscore.
 * - 'legacy': spaces become underscores, the result is urlencode()d, then
 *   '%3A' is turned back into ':' and every remaining '%' into '.'.
 *
 * @param string $id String to escape
 * @param string $mode One of modes from $wgFragmentMode
 * @return string
 * @throws InvalidArgumentException If $mode is neither 'html5' nor 'legacy'
 */
private static function escapeIdInternal( string $id, string $mode ): string {
	// Truncate overly-long IDs. This isn't an HTML limit, it's just
	// griefer protection. [T251506]
	$id = mb_substr( $id, 0, 1024 );

	switch ( $mode ) {
		case 'html5':
			// html5 spec says ids must not have any of the following:
			// U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE
			// In practice, in wikitext, only tab, LF, CR (and SPACE) are
			// possible using either Lua or html entities.
			$id = str_replace( [ "\t", "\n", "\f", "\r", " " ], '_', $id );
			break;
		case 'legacy':
			// This corresponds to 'noninitial' mode of the former escapeId()
			static $replace = [
				'%3A' => ':',
				'%' => '.'
			];

			$id = urlencode( str_replace( ' ', '_', $id ) );
			$id = strtr( $id, $replace );
			break;
		default:
			// Both quoted values are closed properly in the message
			throw new InvalidArgumentException( "Invalid mode '$mode' passed to '" . __METHOD__ . "'" );
	}

	return $id;
}
1034 | |
/**
 * Given a string containing a whitespace delimited list of ids, escape each
 * id with escapeIdForAttribute() and join the results with single spaces.
 *
 * @param string $referenceString Space delimited list of ids
 * @return string Empty string when the list contains no ids
 */
private static function escapeIdReferenceListInternal( string $referenceString ): string {
	# Break the list into its non-empty tokens
	$tokens = preg_split( '/\s+/', $referenceString, -1, PREG_SPLIT_NO_EMPTY );

	# Escape every token as an id
	$escaped = array_map(
		static function ( $token ) {
			return self::escapeIdForAttribute( $token );
		},
		$tokens
	);

	# Glue the tokens back together into a space delimited list
	return implode( ' ', $escaped );
}
1057 | |
/**
 * Given a value, escape it so that it can be used as a CSS class and
 * return it.
 *
 * A leading digit or hyphen, ASCII control characters, space, ASCII
 * punctuation and the byte sequence C2 A0 are turned into underscores;
 * runs of underscores are then collapsed and trailing ones removed.
 *
 * @todo For extra validity, input should be validated UTF-8.
 *
 * @see https://www.w3.org/TR/CSS21/syndata.html Valid characters/format
 * @param string $class
 * @return string
 */
public static function escapeClass( string $class ): string {
	// Convert ugly stuff to underscores ...
	$underscored = preg_replace(
		'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
		'_',
		$class
	);
	// ... and kill underscores in ugly places
	$collapsed = preg_replace( '/_+/', '_', $underscored );

	return rtrim( $collapsed, '_' );
}
1073 | |
/**
 * Given HTML input, escape with htmlspecialchars but un-escape entities.
 * This allows (generally harmless) entities like &nbsp; to survive.
 *
 * @param string $html HTML to escape
 * @param-taint $html escapes_htmlnoent
 * @return string Escaped input
 * @return-taint escaped
 */
public static function escapeHtmlAllowEntities( string $html ): string {
	$decoded = self::decodeCharReferences( $html );

	# ENT_QUOTES: escape ' as well as ", as a matter of course.
	# ENT_SUBSTITUTE: incorrectly truncated multibyte characters must not
	# cause the entire string to disappear.
	return htmlspecialchars( $decoded, ENT_QUOTES | ENT_SUBSTITUTE );
}
1091 | |
/**
 * Return an associative array of attribute names and values from
 * a partial tag string. Attribute names are forced to lowercase,
 * whitespace in values is collapsed and trimmed, and character
 * references are decoded.
 *
 * @param string $text
 * @return array Attribute name => decoded value; empty when $text is
 *  blank or nothing matches the attribute regex
 */
public static function decodeTagAttributes( string $text ): array {
	if ( trim( $text ) === '' ) {
		return [];
	}

	$matchSets = [];
	$found = preg_match_all( self::getAttribsRegex(), $text, $matchSets, PREG_SET_ORDER );
	if ( !$found ) {
		return [];
	}

	$decoded = [];
	foreach ( $matchSets as $matchSet ) {
		$name = strtolower( $matchSet[1] );

		// Only keep attributes whose name consists of acceptable characters
		if ( preg_match( self::getAttribNameRegex(), $name ) ) {
			$raw = self::getTagAttributeCallback( $matchSet );

			// Normalize whitespace
			$collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

			// Decode character references
			$decoded[$name] = self::decodeCharReferences( $collapsed );
		}
	}

	return $decoded;
}
1131 | |
/**
 * Build a partial tag string from an associative array of attribute
 * names and values as returned by decodeTagAttributes.
 *
 * @param array $assoc_array Attribute name => value
 * @return string Each attribute as ' name="value"' (note the leading
 *  space); empty string for an empty array
 */
public static function safeEncodeTagAttributes( array $assoc_array ): string {
	$html = '';
	foreach ( $assoc_array as $attribute => $value ) {
		$encAttribute = htmlspecialchars( $attribute, ENT_COMPAT );
		$encValue = self::safeEncodeAttribute( $value );

		$html .= " $encAttribute=\"$encValue\"";
	}

	return $html;
}
1146 | |
/**
 * Pick the appropriate attribute value from a match set from the
 * attribs regex matches.
 *
 * @param array $set One match set
 * @return string The attribute value, or an empty string for an
 *  attribute without a value
 * @throws LogicException If group 2 is set but none of groups 3 to 5 is
 */
private static function getTagAttributeCallback( array $set ): string {
	// Highest group wins: 5 = no quotes, 4 = single-quoted, 3 = double-quoted
	foreach ( [ 5, 4, 3 ] as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( isset( $set[2] ) ) {
		throw new LogicException( "Tag conditions not met. This should never happen and is a bug." );
	}

	# In XHTML, attributes must have a value so return an empty string.
	# See "Empty attribute syntax",
	# https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
	return "";
}
1170 | |
/**
 * Collapse every run of spaces, tabs, CRs and LFs into a single space and
 * trim the result.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( string $text ): string {
	$collapsed = preg_replace( '/(?:\r\n|[\x20\x0d\x0a\x09])+/', ' ', $text );

	return trim( $collapsed );
}
1177 | |
/**
 * Normalizes whitespace in a section name, such as might be returned
 * by Parser::stripSectionName(), for use in the id's that are used for
 * section links. Every run of spaces and underscores becomes a single
 * space, and the result is trimmed.
 *
 * @param string $section
 * @return string
 */
public static function normalizeSectionNameWhitespace( string $section ): string {
	$singleSpaced = preg_replace( '/[ _]+/', ' ', $section );

	return trim( $singleSpaced );
}
1186 | |
/**
 * Ensure that any entities and character references are legal
 * for XML and XHTML specifically. Any stray bits will be
 * &amp;-escaped to result in a valid text fragment.
 *
 * a. named char refs can only be &lt; &gt; &amp; &quot;, others are
 *   numericized (this way we're well-formed even without a DTD)
 * b. any numeric char refs must be legal chars, not invalid or forbidden
 * c. use lower cased "&#x", not "&#X"
 * d. fix or reject non-valid attributes
 *
 * @internal
 * @param string $text
 * @return string
 */
public static function normalizeCharReferences( string $text ): string {
	$callback = [ self::class, 'normalizeCharReferencesCallback' ];

	// The replacement count is required positionally but not used here
	return preg_replace_callback(
		self::CHAR_REFS_REGEX,
		$callback,
		$text,
		-1,
		$count,
		PREG_UNMATCHED_AS_NULL
	);
}
1207 | |
/**
 * Callback for normalizeCharReferences(): normalize one match of
 * CHAR_REFS_REGEX.
 *
 * @param array $matches Group 1 is handed to normalizeEntity(), group 2
 *  to decCharReference(), group 3 to hexCharReference()
 * @return string The normalized reference, or the HTML-escaped matched
 *  text when no group is set or the handler returned null
 */
private static function normalizeCharReferencesCallback( array $matches ): string {
	if ( isset( $matches[1] ) ) {
		$normalized = self::normalizeEntity( $matches[1] );
	} elseif ( isset( $matches[2] ) ) {
		$normalized = self::decCharReference( $matches[2] );
	} elseif ( isset( $matches[3] ) ) {
		$normalized = self::hexCharReference( $matches[3] );
	} else {
		$normalized = null;
	}

	return $normalized ?? htmlspecialchars( $matches[0], ENT_COMPAT );
}
1223 | |
/**
 * If the named entity is defined in HTML5
 * return the equivalent numeric entity reference (except for the core &lt;
 * &gt; &amp; &quot;). If the entity is a MediaWiki-specific alias, returns
 * the HTML equivalent. Otherwise, returns HTML-escaped text of
 * pseudo-entity source (eg &amp;foo;)
 *
 * @param string $name Semicolon-terminated name
 * @return string
 */
private static function normalizeEntity( string $name ): string {
	if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
		// Non-standard MediaWiki-specific entities
		return '&' . self::MW_ENTITY_ALIASES[$name];
	}

	if ( in_array( $name, [ 'lt;', 'gt;', 'amp;', 'quot;' ], true ) ) {
		// Keep these in word form
		return "&$name";
	}

	if ( isset( HTMLData::$namedEntityTranslations[$name] ) ) {
		// Beware: some entities expand to more than 1 codepoint
		return preg_replace_callback( '/./Ssu', static function ( $m ) {
			return '&#' . \UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';';
		}, HTMLData::$namedEntityTranslations[$name] );
	}

	// Unknown name: escape the ampersand so the text shows up literally
	// instead of being emitted as an undefined entity
	return "&amp;$name";
}
1250 | |
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint Decimal digits of the reference
 * @return string|null "&#N;" form, or null when validateCodepoint()
 *  rejects the codepoint
 */
private static function decCharReference( string $codepoint ): ?string {
	# intval() will (safely) saturate at the maximum signed integer
	# value if $codepoint is too many digits
	$point = intval( $codepoint );

	return self::validateCodepoint( $point ) ? "&#$point;" : null;
}
1261 | |
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint Hex digits of the reference
 * @return string|null Lower-case "&#x...;" form, or null when the value
 *  does not fit an int or validateCodepoint() rejects it
 */
private static function hexCharReference( string $codepoint ): ?string {
	$point = hexdec( $codepoint );

	// hexdec() might return a float if the string is too long
	if ( !is_int( $point ) || !self::validateCodepoint( $point ) ) {
		return null;
	}

	return sprintf( '&#x%x;', $point );
}
1271 | |
/**
 * Returns true if a given Unicode codepoint is a valid character in
 * both HTML5 and XML.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( int $codepoint ): bool {
	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	# U+007F - U+009F are disallowed in HTML5 (control characters).
	if ( $codepoint === 0x09 || $codepoint === 0x0a ) {
		return true;
	}

	// Inclusive [ low, high ] ranges of accepted codepoints
	$ranges = [
		[ 0x20, 0x7e ],
		[ 0xa0, 0xd7ff ],
		[ 0xe000, 0xfffd ],
		[ 0x10000, 0x10ffff ],
	];
	foreach ( $ranges as [ $low, $high ] ) {
		if ( $codepoint >= $low && $codepoint <= $high ) {
			return true;
		}
	}

	return false;
}
1287 | |
/**
 * Decode any character references, numeric or named entities,
 * in the text and return a UTF-8 string.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( string $text ): string {
	$callback = [ self::class, 'decodeCharReferencesCallback' ];

	// The replacement count is required positionally but not used here
	return preg_replace_callback(
		self::CHAR_REFS_REGEX,
		$callback,
		$text,
		-1,
		$count,
		PREG_UNMATCHED_AS_NULL
	);
}
1299 | |
/**
 * Decode any character references, numeric or named entities,
 * in the text and normalize the resulting string. (T16952)
 *
 * This is useful for page titles, not for text to be displayed,
 * MediaWiki allows HTML entities to escape normalization as a feature.
 *
 * The content language's normalize() is only invoked when at least one
 * replacement was made.
 *
 * @param string $text Already normalized, containing entities
 * @return string Still normalized, without entities
 */
public static function decodeCharReferencesAndNormalize( string $text ): string {
	$decoded = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		[ self::class, 'decodeCharReferencesCallback' ],
		$text, -1, $count, PREG_UNMATCHED_AS_NULL
	);

	if ( !$count ) {
		// Nothing was replaced, so the input is returned as it came in
		return $decoded;
	}

	return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $decoded );
}
1323 | |
/**
 * Callback for decoding one match of CHAR_REFS_REGEX.
 *
 * @param array $matches Group 1 is handed to decodeEntity(), group 2 is
 *  treated as a decimal codepoint, group 3 as a hexadecimal codepoint
 * @return string Decoded text; the matched text itself when no group is set
 */
private static function decodeCharReferencesCallback( array $matches ): string {
	if ( isset( $matches[1] ) ) {
		return self::decodeEntity( $matches[1] );
	}

	if ( isset( $matches[2] ) ) {
		return self::decodeChar( intval( $matches[2] ) );
	}

	if ( !isset( $matches[3] ) ) {
		# Last case should be an ampersand by itself
		return $matches[0];
	}

	$point = hexdec( $matches[3] );
	// hexdec() might return a float if the string is too long; that is an
	// invalid character reference.
	return is_int( $point )
		? self::decodeChar( $point )
		: \UtfNormal\Constants::UTF8_REPLACEMENT;
}
1341 | |
/**
 * Return UTF-8 string for a codepoint if that is a valid
 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 * @internal
 * @param int $codepoint
 * @return string
 */
private static function decodeChar( int $codepoint ): string {
	return self::validateCodepoint( $codepoint )
		? \UtfNormal\Utils::codepointToUtf8( $codepoint )
		: \UtfNormal\Constants::UTF8_REPLACEMENT;
}
1354 | |
/**
 * If the named entity is defined in HTML5
 * return the UTF-8 encoding of that character. Otherwise, returns
 * pseudo-entity source (eg "&foo;")
 *
 * @param string $name Semicolon-terminated entity name
 * @return string
 */
private static function decodeEntity( string $name ): string {
	// MediaWiki-specific entities, not in the HTML standard, are mapped to
	// their target name first
	$resolved = self::MW_ENTITY_ALIASES[$name] ?? $name;

	return HTMLData::$namedEntityTranslations[$resolved] ?? "&$resolved";
}
1371 | |
/**
 * Fetch the list of acceptable attributes for a given element name.
 *
 * @param string $element
 * @return array An associative array where keys are acceptable attribute
 *   names; empty when the element has no entry
 */
private static function attributesAllowedInternal( string $element ): array {
	$allowedByElement = self::setupAttributesAllowedInternal();
	if ( isset( $allowedByElement[$element] ) ) {
		return $allowedByElement[$element];
	}

	return [];
}
1383 | |
1384 | /** |
1385 | * Foreach array key (an allowed HTML element), return an array |
1386 | * of allowed attributes. |
1387 | * @return array An associative array: keys are HTML element names; |
1388 | * values are associative arrays where the keys are allowed attribute |
1389 | * names. |
1390 | */ |
1391 | private static function setupAttributesAllowedInternal(): array { |
1392 | static $allowed; |
1393 | |
1394 | if ( $allowed !== null ) { |
1395 | return $allowed; |
1396 | } |
1397 | |
1398 | // For lookup efficiency flip each attributes array so the keys are |
1399 | // the valid attributes. |
1400 | $merge = static function ( $a, $b, $c = [] ) { |
1401 | return array_merge( |
1402 | $a, |
1403 | array_fill_keys( $b, true ), |
1404 | array_fill_keys( $c, true ) ); |
1405 | }; |
1406 | $common = $merge( [], [ |
1407 | # HTML |
1408 | 'id', |
1409 | 'class', |
1410 | 'style', |
1411 | 'lang', |
1412 | 'dir', |
1413 | 'title', |
1414 | 'tabindex', |
1415 | |
1416 | # WAI-ARIA |
1417 | 'aria-describedby', |
1418 | 'aria-flowto', |
1419 | 'aria-hidden', |
1420 | 'aria-label', |
1421 | 'aria-labelledby', |
1422 | 'aria-level', |
1423 | 'aria-owns', |
1424 | 'role', |
1425 | |
1426 | # RDFa |
1427 | # These attributes are specified in section 9 of |
1428 | # https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014 |
1429 | 'about', |
1430 | 'property', |
1431 | 'resource', |
1432 | 'datatype', |
1433 | 'typeof', |
1434 | |
1435 | # Microdata. These are specified by |
1436 | # https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model |
1437 | 'itemid', |
1438 | 'itemprop', |
1439 | 'itemref', |
1440 | 'itemscope', |
1441 | 'itemtype', |
1442 | ] ); |
1443 | |
1444 | $block = $merge( $common, [ 'align' ] ); |
1445 | |
1446 | $tablealign = [ 'align', 'valign' ]; |
1447 | $tablecell = [ |
1448 | 'abbr', |
1449 | 'axis', |
1450 | 'headers', |
1451 | 'scope', |
1452 | 'rowspan', |
1453 | 'colspan', |
1454 | 'nowrap', # deprecated |
1455 | 'width', # deprecated |
1456 | 'height', # deprecated |
1457 | 'bgcolor', # deprecated |
1458 | ]; |
1459 | |
1460 | # Numbers refer to sections in HTML 4.01 standard describing the element. |
1461 | # See: https://www.w3.org/TR/html4/ |
1462 | $allowed = [ |
1463 | # 7.5.4 |
1464 | 'div' => $block, |
1465 | 'center' => $common, # deprecated |
1466 | 'span' => $common, |
1467 | |
1468 | # 7.5.5 |
1469 | 'h1' => $block, |
1470 | 'h2' => $block, |
1471 | 'h3' => $block, |
1472 | 'h4' => $block, |
1473 | 'h5' => $block, |
1474 | 'h6' => $block, |
1475 | |
1476 | # 7.5.6 |
1477 | # address |
1478 | |
1479 | # 8.2.4 |
1480 | 'bdo' => $common, |
1481 | |
1482 | # 9.2.1 |
1483 | 'em' => $common, |
1484 | 'strong' => $common, |
1485 | 'cite' => $common, |
1486 | 'dfn' => $common, |
1487 | 'code' => $common, |
1488 | 'samp' => $common, |
1489 | 'kbd' => $common, |
1490 | 'var' => $common, |
1491 | 'abbr' => $common, |
1492 | # acronym |
1493 | |
1494 | # 9.2.2 |
1495 | 'blockquote' => $merge( $common, [ 'cite' ] ), |
1496 | 'q' => $merge( $common, [ 'cite' ] ), |
1497 | |
1498 | # 9.2.3 |
1499 | 'sub' => $common, |
1500 | 'sup' => $common, |
1501 | |
1502 | # 9.3.1 |
1503 | 'p' => $block, |
1504 | |
1505 | # 9.3.2 |
1506 | 'br' => $merge( $common, [ 'clear' ] ), |
1507 | |
1508 | # https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element |
1509 | 'wbr' => $common, |
1510 | |
1511 | # 9.3.4 |
1512 | 'pre' => $merge( $common, [ 'width' ] ), |
1513 | |
1514 | # 9.4 |
1515 | 'ins' => $merge( $common, [ 'cite', 'datetime' ] ), |
1516 | 'del' => $merge( $common, [ 'cite', 'datetime' ] ), |
1517 | |
1518 | # 10.2 |
1519 | 'ul' => $merge( $common, [ 'type' ] ), |
1520 | 'ol' => $merge( $common, [ 'type', 'start', 'reversed' ] ), |
1521 | 'li' => $merge( $common, [ 'type', 'value' ] ), |
1522 | |
1523 | # 10.3 |
1524 | 'dl' => $common, |
1525 | 'dd' => $common, |
1526 | 'dt' => $common, |
1527 | |
1528 | # 11.2.1 |
1529 | 'table' => $merge( $common, |
1530 | [ 'summary', 'width', 'border', 'frame', |
1531 | 'rules', 'cellspacing', 'cellpadding', |
1532 | 'align', 'bgcolor', |
1533 | ] ), |
1534 | |
1535 | # 11.2.2 |
1536 | 'caption' => $block, |
1537 | |
1538 | # 11.2.3 |
1539 | 'thead' => $common, |
1540 | 'tfoot' => $common, |
1541 | 'tbody' => $common, |
1542 | |
1543 | # 11.2.4 |
1544 | 'colgroup' => $merge( $common, [ 'span' ] ), |
1545 | 'col' => $merge( $common, [ 'span' ] ), |
1546 | |
1547 | # 11.2.5 |
1548 | 'tr' => $merge( $common, [ 'bgcolor' ], $tablealign ), |
1549 | |
1550 | # 11.2.6 |
1551 | 'td' => $merge( $common, $tablecell, $tablealign ), |
1552 | 'th' => $merge( $common, $tablecell, $tablealign ), |
1553 | |
1554 | # 12.2 |
1555 | # NOTE: <a> is not allowed directly, but this list of allowed |
1556 | # attributes is used from the Parser object |
1557 | 'a' => $merge( $common, [ 'href', 'rel', 'rev' ] ), # rel/rev esp. for RDFa |
1558 | |
1559 | # 13.2 |
1560 | # Not usually allowed, but may be used for extension-style hooks |
1561 | # such as <math> when it is rasterized |
1562 | 'img' => $merge( $common, [ 'alt', 'src', 'width', 'height', 'srcset' ] ), |
1563 | # Attributes for A/V tags added in T163583 / T133673 |
1564 | 'audio' => $merge( $common, [ 'controls', 'preload', 'width', 'height' ] ), |
1565 | 'video' => $merge( $common, [ 'poster', 'controls', 'preload', 'width', 'height' ] ), |
1566 | 'source' => $merge( $common, [ 'type', 'src' ] ), |
1567 | 'track' => $merge( $common, [ 'type', 'src', 'srclang', 'kind', 'label' ] ), |
1568 | |
1569 | # 15.2.1 |
1570 | 'tt' => $common, |
1571 | 'b' => $common, |
1572 | 'i' => $common, |
1573 | 'big' => $common, |
1574 | 'small' => $common, |
1575 | 'strike' => $common, |
1576 | 's' => $common, |
1577 | 'u' => $common, |
1578 | |
1579 | # 15.2.2 |
1580 | 'font' => $merge( $common, [ 'size', 'color', 'face' ] ), |
1581 | # basefont |
1582 | |
1583 | # 15.3 |
1584 | 'hr' => $merge( $common, [ 'width' ] ), |
1585 | |
1586 | # HTML Ruby annotation text module, simple ruby only. |
1587 | # https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element |
1588 | 'ruby' => $common, |
1589 | # rbc |
1590 | 'rb' => $common, |
1591 | 'rp' => $common, |
1592 | 'rt' => $common, # $merge( $common, [ 'rbspan' ] ), |
1593 | 'rtc' => $common, |
1594 | |
1595 | # MathML root element, where used for extensions |
1596 | # 'title' may not be 100% valid here; it's XHTML |
1597 | # https://www.w3.org/TR/REC-MathML/ |
1598 | 'math' => $merge( [], [ 'class', 'style', 'id', 'title' ] ), |
1599 | |
1600 | // HTML 5 section 4.5 |
1601 | 'figure' => $common, |
1602 | 'figcaption' => $common, |
1603 | |
1604 | # HTML 5 section 4.6 |
1605 | 'bdi' => $common, |
1606 | |
1607 | # HTML5 elements, defined by: |
1608 | # https://html.spec.whatwg.org/multipage/semantics.html#the-data-element |
1609 | 'data' => $merge( $common, [ 'value' ] ), |
1610 | 'time' => $merge( $common, [ 'datetime' ] ), |
1611 | 'mark' => $common, |
1612 | |
1613 | // meta and link are only permitted by internalRemoveHtmlTags when Microdata |
1614 | // is enabled so we don't bother adding a conditional to hide these |
1615 | // Also meta and link are only valid in WikiText as Microdata elements |
1616 | // (ie: validateTag rejects tags missing the attributes needed for Microdata) |
1617 | // So we don't bother including $common attributes that have no purpose. |
1618 | 'meta' => $merge( [], [ 'itemprop', 'content' ] ), |
1619 | 'link' => $merge( [], [ 'itemprop', 'href', 'title' ] ), |
1620 | |
1621 | # HTML 5 section 4.3.5 |
1622 | 'aside' => $common, |
1623 | ]; |
1624 | |
1625 | return $allowed; |
1626 | } |
1627 | |
/**
 * Reduce a fragment of (potentially invalid) HTML to plain text: all tags
 * are dropped and only the text content is kept.
 *
 * Warning: the returned text must be escaped again before it is included
 * literally in HTML output (as of 1.10).
 *
 * @param string $html HTML fragment
 * @return string
 * @return-taint tainted
 */
public static function stripAllTags( string $html ): string {
	$tokenizerOptions = [
		'ignoreErrors' => true,
		// char refs are deliberately not ignored: we want them decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	];

	// RemexHtml tokenizes the fragment; the handler accumulates the text
	$textCollector = new RemexStripTagHandler();
	$remex = new RemexTokenizer( $textCollector, $html, $tokenizerOptions );
	$remex->execute();

	return self::normalizeWhitespace( $textCollector->getResult() );
}
1654 | |
/**
 * Build a private DOCTYPE that declares HTML's standard named entities.
 *
 * PHP 4 seemed to know these entities when given an HTML doctype, but
 * PHP 5.1 doesn't, so they are declared by hand here.
 *
 * Use for passing XHTML fragments to PHP's XML parsing functions.
 *
 * @deprecated since 1.36; will be made private or removed in a future
 *   release.
 * @return string DOCTYPE declaration, terminated by a newline
 */
public static function hackDocType(): string {
	$declarations = '';

	foreach ( array_keys( HTMLData::$namedEntityTranslations ) as $entity ) {
		// Some HTML entities omit the trailing semicolon;
		// wikitext does not permit these.
		if ( !str_ends_with( $entity, ';' ) ) {
			continue;
		}

		$expansion = self::normalizeEntity( $entity );
		// Entities that normalize to themselves (< > etc) need no declaration
		if ( $expansion === $entity ) {
			continue;
		}

		$name = substr( $entity, 0, -1 );
		$declarations .= "<!ENTITY $name \"$expansion\">";
	}

	return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
1684 | |
/**
 * Clean up a URL: decode character references, percent-encode unsafe
 * characters, strip characters that are ignored in IDNs from the host
 * part, and restore the brackets of IPv6 hosts.
 *
 * @param string $url
 * @return string
 */
public static function cleanUrl( string $url ): string {
	# HTML entities in the input are normalized here; makeExternalLink()
	# will re-escape them.
	$url = self::decodeCharReferences( $url );

	# Percent-encode any control characters the decoding step introduced
	$url = preg_replace_callback(
		'/[\][<>"\\x00-\\x20\\x7F\|]+/',
		static function ( $hit ) {
			return urlencode( $hit[0] );
		},
		$url
	);

	# Split into protocol, host and remainder; bail out if that fails
	$parts = [];
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $parts ) ) {
		return $url;
	}
	$protocol = $parts[1];
	$host = $parts[2];
	$rest = $parts[3];

	// Characters that will be ignored in IDNs.
	// https://datatracker.ietf.org/doc/html/rfc8264#section-9.13
	// https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
	// They are removed up front so that deny lists and the like see the
	// host the way it will actually be interpreted.
	$strip = "/
		\\s| # general whitespace
		\u{00AD}| # SOFT HYPHEN
		\u{034F}| # COMBINING GRAPHEME JOINER
		\u{061C}| # ARABIC LETTER MARK
		[\u{115F}-\u{1160}]| # HANGUL CHOSEONG FILLER..
		# HANGUL JUNGSEONG FILLER
		[\u{17B4}-\u{17B5}]| # KHMER VOWEL INHERENT AQ..
		# KHMER VOWEL INHERENT AA
		[\u{180B}-\u{180D}]| # MONGOLIAN FREE VARIATION SELECTOR ONE..
		# MONGOLIAN FREE VARIATION SELECTOR THREE
		\u{180E}| # MONGOLIAN VOWEL SEPARATOR
		[\u{200B}-\u{200F}]| # ZERO WIDTH SPACE..
		# RIGHT-TO-LEFT MARK
		[\u{202A}-\u{202E}]| # LEFT-TO-RIGHT EMBEDDING..
		# RIGHT-TO-LEFT OVERRIDE
		[\u{2060}-\u{2064}]| # WORD JOINER..
		# INVISIBLE PLUS
		\u{2065}| # <reserved-2065>
		[\u{2066}-\u{206F}]| # LEFT-TO-RIGHT ISOLATE..
		# NOMINAL DIGIT SHAPES
		\u{3164}| # HANGUL FILLER
		[\u{FE00}-\u{FE0F}]| # VARIATION SELECTOR-1..
		# VARIATION SELECTOR-16
		\u{FEFF}| # ZERO WIDTH NO-BREAK SPACE
		\u{FFA0}| # HALFWIDTH HANGUL FILLER
		[\u{FFF0}-\u{FFF8}]| # <reserved-FFF0>..
		# <reserved-FFF8>
		[\u{1BCA0}-\u{1BCA3}]| # SHORTHAND FORMAT LETTER OVERLAP..
		# SHORTHAND FORMAT UP STEP
		[\u{1D173}-\u{1D17A}]| # MUSICAL SYMBOL BEGIN BEAM..
		# MUSICAL SYMBOL END PHRASE
		\u{E0000}| # <reserved-E0000>
		\u{E0001}| # LANGUAGE TAG
		[\u{E0002}-\u{E001F}]| # <reserved-E0002>..
		# <reserved-E001F>
		[\u{E0020}-\u{E007F}]| # TAG SPACE..
		# CANCEL TAG
		[\u{E0080}-\u{E00FF}]| # <reserved-E0080>..
		# <reserved-E00FF>
		[\u{E0100}-\u{E01EF}]| # VARIATION SELECTOR-17..
		# VARIATION SELECTOR-256
		[\u{E01F0}-\u{E0FFF}]| # <reserved-E01F0>..
		# <reserved-E0FFF>
		/xuD";

	$host = preg_replace( $strip, '', $host );

	// IPv6 host names are bracketed with [], which the escaping step above
	// turned into %5B / %5D. Url-decode these.
	$ipv6 = [];
	if ( str_starts_with( $host, "//%5B" ) &&
		preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $ipv6 )
	) {
		$host = '//[' . $ipv6[1] . ']' . $ipv6[2];
	}

	// @todo FIXME: Validate hostnames here

	return $protocol . $host . $rest;
}
1765 | |
/**
 * Does a string look like an e-mail address?
 *
 * This validates an email address using an HTML5 specification found at:
 * http://www.whatwg.org/html/states-of-the-type-attribute.html#valid-e-mail-address
 * Which as of 2011-01-24 says:
 *
 *   A valid e-mail address is a string that matches the ABNF production
 *   1*( atext / "." ) "@" ldh-str *( "." ldh-str ) where atext is defined
 *   in RFC 5322 section 3.2.3, and ldh-str is defined in RFC 1034 section
 *   3.5.
 *
 * This function is an implementation of the specification as requested in
 * T24449.
 *
 * Client-side forms will use the same standard validation rules via JS or
 * HTML 5 validation; additional restrictions can be enforced server-side
 * by extensions via the 'isValidEmailAddr' hook. When a hook handler aborts
 * the hook, the value it left in $result is returned (cast to bool) and the
 * built-in check is skipped.
 *
 * Note that this validation doesn't 100% match RFC 2822, but is believed
 * to be liberal enough for wide use. Some invalid addresses will still
 * pass validation here.
 *
 * The whole string must match: an address followed by a trailing newline
 * is rejected.
 *
 * @since 1.18
 *
 * @param string $addr E-mail address
 * @return bool
 */
public static function validateEmail( string $addr ): bool {
	$result = null;
	// TODO This method should be non-static, and have a HookRunner injected
	$hookRunner = new HookRunner( MediaWikiServices::getInstance()->getHookContainer() );
	if ( !$hookRunner->onIsValidEmailAddr( $addr, $result ) ) {
		// A handler may abort without ever assigning $result, leaving it
		// null. Cast so the declared bool return type always holds instead
		// of raising a TypeError.
		return (bool)$result;
	}

	// Please note the strings below are enclosed in brackets [], which makes
	// the hyphen "-" a range indicator. Hence it is double backslashed below.
	// See T28948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	// The D modifier makes "$" match only at the very end of the subject;
	// without it "$" would also match just before a final newline.
	$html5_email_regexp = "/
		^ # start of string
		[$rfc5322_atext\\.]+ # user part which is liberal :p
		@ # at sign
		[$rfc1034_ldh_str]+ # First domain part
		(\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
		$ # End of string
		/ixD"; // case Insensitive, eXtended, Dollar matches at end only

	return (bool)preg_match( $html5_email_regexp, $addr );
}
1819 | } |
1820 | |
/**
 * Registers 'Sanitizer' as an alias name for this class.
 *
 * @deprecated class alias since 1.41
 */
class_alias( Sanitizer::class, 'Sanitizer' );