Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
63.28% |
398 / 629 |
|
64.58% |
31 / 48 |
CRAP | |
0.00% |
0 / 1 |
Sanitizer | |
63.38% |
398 / 628 |
|
64.58% |
31 / 48 |
1943.80 | |
0.00% |
0 / 1 |
getAttribsRegex | |
18.18% |
2 / 11 |
|
0.00% |
0 / 1 |
4.19 | |||
getAttribNameRegex | |
40.00% |
2 / 5 |
|
0.00% |
0 / 1 |
2.86 | |||
getRecognizedTagData | |
40.00% |
24 / 60 |
|
0.00% |
0 / 1 |
21.82 | |||
internalRemoveHtmlTags | |
96.43% |
27 / 28 |
|
0.00% |
0 / 1 |
12 | |||
removeSomeTags | |
100.00% |
28 / 28 |
|
100.00% |
1 / 1 |
1 | |||
removeHTMLcomments | |
11.76% |
2 / 17 |
|
0.00% |
0 / 1 |
51.96 | |||
validateTag | |
77.78% |
7 / 9 |
|
0.00% |
0 / 1 |
8.70 | |||
validateTagAttributes | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
validateAttributes | |
91.30% |
42 / 46 |
|
0.00% |
0 / 1 |
36.85 | |||
isReservedDataAttribute | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
mergeAttributes | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
6 | |||
normalizeCss | |
55.56% |
10 / 18 |
|
0.00% |
0 / 1 |
5.40 | |||
checkCss | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
4 | |||
cssDecodeCallback | |
80.00% |
8 / 10 |
|
0.00% |
0 / 1 |
8.51 | |||
fixTagAttributes | |
85.71% |
6 / 7 |
|
0.00% |
0 / 1 |
3.03 | |||
encodeAttribute | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 | |||
armorFrenchSpaces | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
safeEncodeAttribute | |
100.00% |
24 / 24 |
|
100.00% |
1 / 1 |
1 | |||
escapeIdForAttribute | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
escapeIdForLink | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
escapeIdForExternalInterwiki | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
escapeIdInternalUrl | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
escapeIdInternal | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
4 | |||
escapeIdReferenceListInternal | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
escapeClass | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
escapeHtmlAllowEntities | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
decodeTagAttributes | |
100.00% |
19 / 19 |
|
100.00% |
1 / 1 |
5 | |||
safeEncodeTagAttributes | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
getTagAttributeCallback | |
88.89% |
8 / 9 |
|
0.00% |
0 / 1 |
5.03 | |||
normalizeWhitespace | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
normalizeSectionNameWhitespace | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
normalizeCharReferences | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
normalizeCharReferencesCallback | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
5 | |||
normalizeEntity | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
4 | |||
decCharReference | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
hexCharReference | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
validateCodepoint | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
10 | |||
decodeCharReferences | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
decodeCharReferencesAndNormalize | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
2 | |||
decodeCharReferencesCallback | |
90.00% |
9 / 10 |
|
0.00% |
0 / 1 |
5.03 | |||
decodeChar | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
decodeEntity | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
2.06 | |||
attributesAllowedInternal | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
setupAttributesAllowedInternal | |
2.26% |
3 / 133 |
|
0.00% |
0 / 1 |
5.74 | |||
stripAllTags | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
1 | |||
hackDocType | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
20 | |||
cleanUrl | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
4 | |||
validateEmail | |
91.67% |
11 / 12 |
|
0.00% |
0 / 1 |
2.00 |
1 | <?php |
2 | /** |
3 | * HTML sanitizer for %MediaWiki. |
4 | * |
5 | * Copyright © 2002-2005 Brooke Vibber <bvibber@wikimedia.org> et al |
6 | * https://www.mediawiki.org/ |
7 | * |
8 | * This program is free software; you can redistribute it and/or modify |
9 | * it under the terms of the GNU General Public License as published by |
10 | * the Free Software Foundation; either version 2 of the License, or |
11 | * (at your option) any later version. |
12 | * |
13 | * This program is distributed in the hope that it will be useful, |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | * GNU General Public License for more details. |
17 | * |
18 | * You should have received a copy of the GNU General Public License along |
19 | * with this program; if not, write to the Free Software Foundation, Inc., |
20 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
21 | * http://www.gnu.org/copyleft/gpl.html |
22 | * |
23 | * @file |
24 | * @ingroup Parser |
25 | */ |
26 | |
27 | namespace MediaWiki\Parser; |
28 | |
29 | use InvalidArgumentException; |
30 | use LogicException; |
31 | use MediaWiki\HookContainer\HookRunner; |
32 | use MediaWiki\MediaWikiServices; |
33 | use MediaWiki\Tidy\RemexCompatFormatter; |
34 | use StringUtils; |
35 | use UnexpectedValueException; |
36 | use Wikimedia\RemexHtml\HTMLData; |
37 | use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer; |
38 | use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer; |
39 | use Wikimedia\RemexHtml\TreeBuilder\Dispatcher as RemexDispatcher; |
40 | use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder as RemexTreeBuilder; |
41 | |
42 | /** |
43 | * HTML sanitizer for MediaWiki |
44 | * @ingroup Parser |
45 | */ |
46 | class Sanitizer { |
47 | /** |
48 | * Regular expression to match various types of character references in |
49 | * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences. |
50 | * Note that HTML5 allows some named entities to omit the trailing |
51 | * semicolon; wikitext entities *must* have a trailing semicolon. |
52 | */ |
53 | private const CHAR_REFS_REGEX = |
54 | '/&([A-Za-z0-9\x80-\xff]+;) |
55 | |&\#([0-9]+); |
56 | |&\#[xX]([0-9A-Fa-f]+); |
57 | |&/x'; |
58 | |
59 | /** |
60 | * Acceptable tag name charset from HTML5 parsing spec |
61 | * https://www.w3.org/TR/html5/syntax.html#tag-open-state |
62 | */ |
63 | private const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!'; |
64 | |
65 | /** |
66 | * Pattern matching evil uris like javascript: |
67 | * WARNING: DO NOT use this in any place that actually requires denying |
68 | * certain URIs for security reasons. There are NUMEROUS[1] ways to bypass |
69 | * pattern-based deny lists; the only way to be secure from javascript: |
70 | * uri based xss vectors is to allow only things that you know are safe |
71 | * and deny everything else. |
72 | * [1]: http://ha.ckers.org/xss.html |
73 | */ |
74 | private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i'; |
75 | private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/"; |
76 | |
77 | /** |
78 | * Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding. |
79 | * |
80 | * @since 1.30 |
81 | */ |
82 | public const ID_PRIMARY = 0; |
83 | |
84 | /** |
85 | * Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false |
86 | * if no fallback is configured. |
87 | * |
88 | * @since 1.30 |
89 | */ |
90 | public const ID_FALLBACK = 1; |
91 | |
92 | /** |
93 | * Character entity aliases accepted by MediaWiki in wikitext. |
94 | * These are not part of the HTML standard. |
95 | */ |
96 | private const MW_ENTITY_ALIASES = [ |
97 | 'רלמ;' => 'rlm;', |
98 | 'رلم;' => 'rlm;', |
99 | ]; |
100 | |
/**
 * Cache for the pattern produced by getAttribsRegex(); null until first use.
 */
private static ?string $attribsRegex = null;

/**
 * Return the regular expression that matches HTML/XML attribute pairs
 * within a tag, building it on the first call and reusing it afterwards.
 * Based on https://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
 * Used in Sanitizer::decodeTagAttributes
 *
 * Capture groups: 1 is the attribute name, 2 the whole "= value" part,
 * 3 a double-quoted value, 4 a single-quoted value, 5 an unquoted value.
 */
private static function getAttribsRegex(): string {
	if ( self::$attribsRegex !== null ) {
		return self::$attribsRegex;
	}

	// Tab, LF, FF, CR and space, spelled as PCRE escapes.
	$wsChars = '\x09\x0a\x0c\x0d\x20';
	$ws = "[{$wsChars}]";
	// Any character that may continue an attribute name.
	$nameChar = "[^{$wsChars}\/>=]";
	// The first character of a name may additionally be "=".
	$nameStart = "(?:{$nameChar}|=)";

	self::$attribsRegex =
		"/({$nameStart}{$nameChar}*)
		($ws*=$ws*
			(?:
				# The attribute value: quoted or alone
				\"([^\"]*)(?:\"|\$)
				| '([^']*)(?:'|\$)
				| (((?!$ws|>).)*)
			)
		)?/sxu";

	return self::$attribsRegex;
}
130 | |
/**
 * Cache for the pattern produced by getAttribNameRegex(); null until first use.
 */
private static ?string $attribNameRegex = null;

/**
 * Return the pattern used in Sanitizer::decodeTagAttributes to filter
 * attributes by name, compiling it on the first call only.
 */
private static function getAttribNameRegex(): string {
	if ( self::$attribNameRegex !== null ) {
		return self::$attribNameRegex;
	}

	// First character: colon, underscore, or any Unicode letter/number.
	$nameStart = '[:_\p{L}\p{N}]';
	// Later characters may also be a dot or a hyphen.
	$nameChar = '[:_\.\-\p{L}\p{N}]';

	self::$attribNameRegex = '/^(' . $nameStart . $nameChar . '*)$/sxu';
	return self::$attribNameRegex;
}
147 | |
/**
 * Return the various lists of recognized tags, each as a lookup table
 * keyed by tag name.
 *
 * The built-in tables are computed once and kept in static variables; the
 * result for the argument-less call is additionally memoised as a whole.
 *
 * @param string[] $extratags For any extra tags to include
 * @param string[] $removetags For any tags (default or extra) to exclude
 * @return array Map with the keys 'htmlpairs', 'htmlsingle',
 *   'htmlsingleonly', 'htmlnest', 'tabletags', 'htmllist', 'listtags',
 *   'htmlsingleallowed' and 'htmlelements'
 * @internal
 */
public static function getRecognizedTagData( array $extratags = [], array $removetags = [] ): array {
	static $commonCase, $staticInitialised = false;
	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic;

	$isCommonCase = ( $extratags === [] && $removetags === [] );
	if ( $staticInitialised && $isCommonCase && $commonCase ) {
		return $commonCase;
	}

	if ( !$staticInitialised ) {
		# Tags that must be closed
		$pairedList = [
			'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
			'kbd', 'samp', 'data', 'time', 'mark'
		];

		# These tags can be self-closed. For tags not also on
		# $htmlsingleonly, a self-closed tag will be emitted as
		# an empty element (open-tag/close-tag pair).
		$singleList = [
			'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
		];

		# Elements that cannot have close tags. This is (not coincidentally)
		# also the list of tags for which the HTML 5 parsing algorithm
		# requires you to "acknowledge the token's self-closing flag", i.e.
		# a self-closing tag like <br/> is not an HTML 5 parse error only
		# for this list.
		$singleOnlyList = [
			'br', 'wbr', 'hr', 'meta', 'link'
		];

		# Tags that can be nested--??
		$nestList = [
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
			'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
		];

		# Can only appear inside table, we will close them
		$tableList = [
			'td', 'th', 'tr',
		];

		# Tags used by list
		$listContainerList = [
			'ul', 'ol',
		];

		# Tags that can appear in a list
		$listItemList = [
			'li',
		];

		# The combined lists are derived from the plain lists, before
		# anything is turned into a lookup table.
		$singleAllowedList = array_unique( array_merge( $singleList, $tableList ) );
		$elementList = array_unique( array_merge( $singleList, $pairedList, $nestList ) );

		# Convert them all to hashtables for faster lookup
		$htmlpairsStatic = array_fill_keys( $pairedList, true );
		$htmlsingle = array_fill_keys( $singleList, true );
		$htmlsingleonly = array_fill_keys( $singleOnlyList, true );
		$htmlnest = array_fill_keys( $nestList, true );
		$tabletags = array_fill_keys( $tableList, true );
		$htmllist = array_fill_keys( $listContainerList, true );
		$listtags = array_fill_keys( $listItemList, true );
		$htmlsingleallowed = array_fill_keys( $singleAllowedList, true );
		$htmlelementsStatic = array_fill_keys( $elementList, true );

		$staticInitialised = true;
	}

	# Fold the caller's $extratags and $removetags into the pair and element tables
	$extraLookup = array_fill_keys( $extratags, true );
	$removeLookup = array_fill_keys( $removetags, true );

	$result = [
		'htmlpairs' => array_merge( $extraLookup, $htmlpairsStatic ),
		'htmlsingle' => $htmlsingle,
		'htmlsingleonly' => $htmlsingleonly,
		'htmlnest' => $htmlnest,
		'tabletags' => $tabletags,
		'htmllist' => $htmllist,
		'listtags' => $listtags,
		'htmlsingleallowed' => $htmlsingleallowed,
		'htmlelements' => array_diff_key(
			array_merge( $extraLookup, $htmlelementsStatic ),
			$removeLookup
		),
	];

	if ( $isCommonCase ) {
		$commonCase = $result;
	}
	return $result;
}
239 | |
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments; BEWARE there may be unmatched HTML
 * tags in the result.
 *
 * The text is split on '<'. Each piece that parses as a recognized,
 * valid tag is re-emitted with sanitized attributes; every other piece
 * has its leading '<' emitted as '&lt;'. Any bare '>' in text is emitted
 * as '&gt;'.
 *
 * @note Callers are recommended to use `::removeSomeTags()` instead
 * of this method. `Sanitizer::removeSomeTags()` is safer and will
 * always return well-formed HTML; however, it is significantly
 * slower (especially for short strings where setup costs
 * predominate). This method is for internal use by the legacy parser
 * where we know the result will be cleaned up in a subsequent tidy pass.
 *
 * @param string $text Original string; see T268353 for why untainted.
 * @param-taint $text none
 * @param callable|null $processCallback Callback to do any variable or
 *   parameter replacements in HTML attribute values. It receives the raw
 *   attribute string by reference, followed by $args.
 *   This argument should be considered @internal.
 * @param-taint $processCallback exec_shell
 * @param array|bool $args Arguments for the processing callback
 * @param-taint $args none
 * @param array $extratags For any extra tags to include
 * @param-taint $extratags tainted
 * @param array $removetags For any tags (default or extra) to exclude
 * @param-taint $removetags none
 * @return string
 * @return-taint escaped
 * @internal
 */
public static function internalRemoveHtmlTags( string $text, ?callable $processCallback = null,
	$args = [], array $extratags = [], array $removetags = []
): string {
	$tagData = self::getRecognizedTagData( $extratags, $removetags );
	$htmlsingle = $tagData['htmlsingle'];
	$htmlsingleonly = $tagData['htmlsingleonly'];
	$htmlelements = $tagData['htmlelements'];

	# Remove HTML comments
	$text = self::removeHTMLcomments( $text );
	$bits = explode( '<', $text );
	# Everything before the first '<' is plain text; a stray '>' in it
	# must be escaped so it cannot be read as the end of a tag.
	$text = str_replace( '>', '&gt;', array_shift( $bits ) );

	# this might be possible using remex tidy itself
	foreach ( $bits as $x ) {
		if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
			[ /* $qbar */, $slash, $t, $params, $brace, $rest ] = $regs;

			$badtag = false;
			$t = strtolower( $t );
			if ( isset( $htmlelements[$t] ) ) {
				if ( is_callable( $processCallback ) ) {
					call_user_func_array( $processCallback, [ &$params, $args ] );
				}

				if ( $brace === '/>' && !( isset( $htmlsingle[$t] ) || isset( $htmlsingleonly[$t] ) ) ) {
					// Remove the self-closing slash, to be consistent
					// with HTML5 semantics. T134423
					$brace = '>';
				}
				if ( !self::validateTag( $params, $t ) ) {
					$badtag = true;
				}

				$newparams = self::fixTagAttributes( $params, $t );
				if ( !$badtag ) {
					if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) {
						# Interpret self-closing tags as empty tags even when
						# HTML 5 would interpret them as start tags. Such input
						# is commonly seen on Wikimedia wikis with this intention.
						$brace = "></$t>";
					}

					# Text following the tag: escape any further '>'.
					$rest = str_replace( '>', '&gt;', $rest );
					$text .= "<$slash$t$newparams$brace$rest";
					continue;
				}
			}
		}
		# Not a tag we accept: neutralise the whole piece, both the '<'
		# that introduced it and any '>' it contains.
		$text .= '&lt;' . str_replace( '>', '&gt;', $x );
	}
	return $text;
}
321 | |
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments; the result will always be balanced and
 * tidy HTML.
 *
 * The text is tokenized with RemexHtml, passed through a
 * RemexRemoveTagHandler configured with the recognized tag data, and
 * serialized again as a fragment of a `body` element.
 *
 * @param string $text Source string; see T268353 for why untainted
 * @param-taint $text none
 * @param array $options Options controlling the cleanup:
 *   string[] $options['extraTags'] Any extra tags to allow
 *     (This property taints the whole array.)
 *   string[] $options['removeTags'] Any tags (default or extra) to exclude
 *   callable(Attributes,...):Attributes $options['attrCallback'] Callback
 *     to do any variable or parameter replacements in HTML attribute
 *     values before further cleanup; should be considered @internal
 *     and not for external use.
 *   array $options['attrCallbackArgs'] Additional arguments for the
 *     attribute callback
 * @param-taint $options tainted
 * @return string The cleaned up HTML
 * @return-taint escaped
 * @since 1.38
 */
public static function removeSomeTags(
	string $text, array $options = []
): string {
	// This disallows HTML5-style "missing trailing semicolon" attributes
	// In wikitext "clean&copy" does *not* contain an entity.
	$text = self::normalizeCharReferences( $text );

	$tagData = self::getRecognizedTagData(
		$options['extraTags'] ?? [],
		$options['removeTags'] ?? []
	);

	// Output side of the pipeline: tree builder feeding a serializer.
	$serializer = new RemexSerializer( new RemexCompatFormatter );
	$treeBuilder = new RemexTreeBuilder( $serializer, [
		'ignoreErrors' => true,
		'ignoreNulls' => true,
	] );

	// Use RemexHtml to tokenize $text and remove the barred tags.
	// The attribute callback options are @internal.
	$remover = new RemexRemoveTagHandler(
		new RemexDispatcher( $treeBuilder ),
		$text,
		$tagData,
		$options['attrCallback'] ?? null,
		$options['attrCallbackArgs'] ?? []
	);

	$tokenizerOptions = [
		'ignoreErrors' => true,
		// don't ignore char refs, we want them to be decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	];
	$tokenizer = new RemexTokenizer( $remover, $text, $tokenizerOptions );
	$tokenizer->execute( [
		'fragmentNamespace' => HTMLData::NS_HTML,
		'fragmentName' => 'body',
	] );

	return $serializer->getResult();
}
382 | |
/**
 * Remove '<!--', '-->', and everything between.
 * To avoid leaving blank lines, when a comment is both preceded
 * and followed by a newline (ignoring spaces), trim leading and
 * trailing spaces and one of the newlines.
 *
 * Processing stops at the first comment that is never closed; that
 * comment and everything after it are left in place.
 */
public static function removeHTMLcomments( string $text ): string {
	for ( ; ; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			# No (more) comments
			break;
		}

		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		# Point just past the closing '-->'
		$close += 3;

		# Grow a window around the comment: start one character before
		# it, then swallow the run of spaces on either side.
		$from = max( $open - 1, 0 );
		$span = $close - $from;
		while ( $from > 0 && substr( $text, $from, 1 ) === ' ' ) {
			$from--;
			$span++;
		}
		while ( substr( $text, $from + $span, 1 ) === ' ' ) {
			$span++;
		}

		$newlineBefore = substr( $text, $from, 1 ) === "\n";
		$newlineAfter = substr( $text, $from + $span, 1 ) === "\n";

		if ( $newlineBefore && $newlineAfter ) {
			# Remove the comment, leading and trailing
			# spaces, and leave only one newline.
			$text = substr_replace( $text, "\n", $from, $span + 1 );
			continue;
		}

		# Remove just the comment.
		$text = substr_replace( $text, '', $open, $close - $open );
	}

	return $text;
}
423 | |
/**
 * Takes the raw attribute string for a tag and the tag name and
 * validates that the tag is allowed to be present.
 * This DOES NOT validate the attributes, nor does it validate the
 * tags themselves. This method only handles the special circumstances
 * where we may want to allow a tag within content but ONLY when it has
 * specific attributes set.
 *
 * @see RemexRemoveTagHandler::validateTag()
 * @param string $params Attribute text of the tag, decoded here with
 *   decodeTagAttributes()
 * @param string $element Tag name
 * @return bool False for a <meta> or <link> lacking its required
 *   attributes, true otherwise
 */
private static function validateTag( string $params, string $element ): bool {
	$attribs = self::decodeTagAttributes( $params );

	switch ( $element ) {
		case 'meta':
			// <meta> must have an itemprop="" (otherwise it is not valid
			// or safe in content) and a content="" for the itemprop
			return isset( $attribs['itemprop'] ) && isset( $attribs['content'] );

		case 'link':
			// <link> must have an itemprop="" (otherwise it is not valid
			// or safe in content) and an associated href=""
			return isset( $attribs['itemprop'] ) && isset( $attribs['href'] );

		default:
			return true;
	}
}
454 | |
/**
 * Take an array of attribute names and values and normalize or discard
 * illegal values for the given element type.
 *
 * The allowed attribute list for the element is looked up with
 * attributesAllowedInternal() and the work is delegated to
 * validateAttributes(), which:
 *
 * - Discards attributes not allowed for the given element
 * - Discards unsafe style attributes
 * - Re-encodes invalid id attributes
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
public static function validateTagAttributes( array $attribs, string $element ): array {
	$allowedForElement = self::attributesAllowedInternal( $element );
	return self::validateAttributes( $attribs, $allowedForElement );
}
470 | |
/**
 * Take an array of attribute names and values and normalize or discard
 * illegal values.
 *
 * - Discards attributes not on the given list
 * - Unsafe style attributes are discarded
 * - Invalid id attributes are re-encoded
 *
 * @param array $attribs
 * @param array $allowed List of allowed attribute names,
 *   as an associative array where keys give valid attribute names
 *   (since 1.34). Before 1.35, passing a sequential array of
 *   valid attribute names was permitted but that is now deprecated.
 * @return array
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
public static function validateAttributes( array $attribs, array $allowed ): array {
	if ( isset( $allowed[0] ) ) {
		// Calling this function with a sequential array is
		// deprecated. For now just convert it.
		wfDeprecated( __METHOD__ . ' with sequential array', '1.35' );
		$allowed = array_fill_keys( $allowed, true );
	}

	$protocols = MediaWikiServices::getInstance()->getUrlUtils()->validProtocols();
	$hrefExp = '/^(' . $protocols . ')[^\s]+$/';

	# Attributes whose value is a list of HTML ids
	$idListAttribs = [ 'aria-describedby', 'aria-flowto', 'aria-labelledby', 'aria-owns' ];
	// RDFa and microdata properties allow URLs, URIs and/or CURIs.
	$uriishAttribs = [
		'rel', 'rev',
		# RDFa
		'about', 'property', 'resource', 'datatype', 'typeof',
		# HTML5 microdata
		'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',
	];
	# Attributes that must carry a URL with an allowed protocol
	$urlAttribs = [ 'href', 'src', 'poster' ];

	$out = [];
	foreach ( $attribs as $attribute => $value ) {
		# Allow XML namespace declaration to allow RDFa
		if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				$out[$attribute] = $value;
			}
			continue;
		}

		# Allow any attribute beginning with "data-"
		# However:
		# * Disallow data attributes used by MediaWiki code
		# * Ensure that the attribute is not namespaced by banning
		#   colons.
		$isPlainDataAttrib = preg_match( '/^data-[^:]*$/i', $attribute );
		if ( !$isPlainDataAttrib && !array_key_exists( $attribute, $allowed ) ) {
			continue;
		}
		if ( self::isReservedDataAttribute( $attribute ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# https://msdn.microsoft.com/en-us/library/ms537634.aspx
		if ( $attribute == 'style' ) {
			$value = self::checkCss( $value );
		}

		# Escape HTML id attributes
		if ( $attribute === 'id' ) {
			$value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
			if ( $value === false || $value === '' ) {
				continue;
			}
		}

		# Escape HTML id reference lists
		if ( in_array( $attribute, $idListAttribs, true ) ) {
			$value = self::escapeIdReferenceListInternal( $value );
		}

		// Paranoia. Allow "simple" values but suppress javascript
		if ( in_array( $attribute, $uriishAttribs, true )
			&& preg_match( self::EVIL_URI_PATTERN, $value )
		) {
			continue;
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		#       validation code that can be used by tag hook handlers, etc
		if ( in_array( $attribute, $urlAttribs, true ) && !preg_match( $hrefExp, $value ) ) {
			// drop any href or src attributes not using an allowed protocol.
			// NOTE: this also drops all relative URLs
			continue;
		}

		// Only allow tabindex of 0, which is useful for accessibility.
		if ( $attribute === 'tabindex' && $value !== '0' ) {
			continue;
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$out[$attribute] = $value;
	}

	# itemtype, itemid, itemref don't make sense without itemscope
	if ( !array_key_exists( 'itemscope', $out ) ) {
		unset( $out['itemtype'], $out['itemid'], $out['itemref'] );
	}
	# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

	return $out;
}
591 | |
/**
 * Given an attribute name, checks whether it is a reserved data attribute
 * (such as data-mw-foo) which is unavailable to user-generated HTML so MediaWiki
 * core and extension code can safely use it to communicate with frontend code.
 *
 * The check is a case-insensitive prefix match, so any name that starts
 * with "data-ooui", "data-mw" or "data-parsoid" counts as reserved.
 *
 * @param string $attr Attribute name.
 * @return bool
 */
public static function isReservedDataAttribute( string $attr ): bool {
	// data-ooui is reserved for ooui.
	// data-mw and data-parsoid are reserved for parsoid.
	// data-mw-<name here> is reserved for extensions (or core) if
	// they need to communicate some data to the client and want to be
	// sure that it isn't coming from an untrusted user.
	// We ignore the possibility of namespaces since user-generated HTML
	// can't use them anymore.
	$matched = preg_match( '/^data-(ooui|mw|parsoid)/i', $attr );
	return $matched === 1;
}
609 | |
/**
 * Merge two sets of HTML attributes. Conflicting items in the second set
 * will override those in the first, except for 'class' attributes which
 * will be combined (if they're both strings).
 *
 * Combined class lists are split on whitespace, de-duplicated and joined
 * with single spaces, keeping the classes of $a ahead of those of $b.
 *
 * @todo implement merging for other attributes such as style
 */
public static function mergeAttributes( array $a, array $b ): array {
	$merged = array_merge( $a, $b );

	$classA = $a['class'] ?? null;
	$classB = $b['class'] ?? null;
	// Nothing to combine unless both sides carry a string class, and
	// identical strings need no work either.
	if ( !is_string( $classA ) || !is_string( $classB ) || $classA === $classB ) {
		return $merged;
	}

	$names = preg_split( '/\s+/', "{$classA} {$classB}", -1, PREG_SPLIT_NO_EMPTY );
	$merged['class'] = implode( ' ', array_unique( $names ) );
	return $merged;
}
629 | |
/**
 * Normalize CSS into a format we can easily search for hostile input
 *  - decode character references
 *  - decode escape sequences
 *  - remove comments, unless the entire value is one single comment
 * @param string $value the css string
 * @return string normalized css
 */
public static function normalizeCss( string $value ): string {
	// Decode character references such as &#123;
	$value = self::decodeCharReferences( $value );

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references.
	// This means it isn't possible for this function to return
	// unsanitized escape sequences. It is possible to manufacture
	// input that contains character references that decode to
	// escape sequences that decode to character references, but
	// it's OK for the return value to contain character references
	// because the caller is supposed to escape those anyway.
	static $decodeRegex;
	if ( $decodeRegex === null ) {
		// Built once per request and reused on later calls.
		$ws = '[\\x20\\t\\r\\n\\f]';
		$newline = '(?:\\n|\\r\\n|\\r|\\f)';
		$bs = '\\\\';
		$decodeRegex = "/ $bs
			(?:
				($newline) |  # 1. Line continuation
				([0-9A-Fa-f]{1,6})$ws? |  # 2. character number
				(.) | # 3. backslash cancelling special meaning
				() | # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback(
		$decodeRegex,
		[ self::class, 'cssDecodeCallback' ],
		$value
	);

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if ( preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
		return $value;
	}

	// Remove any comments; IE gets token splitting wrong
	// This must be done AFTER decoding character references and
	// escape sequences, because those steps can introduce comments
	// This step cannot introduce character references or escape
	// sequences, because it replaces comments with spaces rather
	// than removing them completely.
	$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

	// Remove anything after a comment-start token, to guard against
	// incorrect client implementations.
	$danglingComment = strpos( $value, '/*' );
	return $danglingComment === false
		? $value
		: substr( $value, 0, $danglingComment );
}
689 | |
/**
 * Pick apart some CSS and check it for forbidden or unsafe structures.
 * Returns a sanitized string. This sanitized string will have
 * character references and escape sequences decoded and comments
 * stripped (unless it is itself one valid comment, in which case the value
 * will be passed through). If the input is just too evil, only a comment
 * complaining about evilness will be returned.
 *
 * Currently URL references, 'expression', 'tps' are forbidden.
 *
 * NOTE: Despite the fact that character references are decoded, the
 * returned string may contain character references given certain
 * clever input strings. These character references must
 * be escaped before the return value is embedded in HTML.
 *
 * @warning This method is intended to sanitize style attributes on
 * html tags only. It is not safe to use on full CSS files.
 * @param string $value
 * @return string The normalized CSS, or one of the placeholder comments
 *   '/* invalid control char *' . '/' and '/* insecure input *' . '/'
 */
public static function checkCss( $value ) {
	$value = self::normalizeCss( $value );

	// Reject control characters and the UTF-8 replacement character
	$hasControlChar = preg_match( '/[\000-\010\013\016-\037\177]/', $value );
	if ( $hasControlChar
		|| strpos( $value, \UtfNormal\Constants::UTF8_REPLACEMENT ) !== false
	) {
		return '/* invalid control char */';
	}

	// Reject problematic keywords
	$hasUnsafeToken = preg_match(
		'! expression
		| accelerator\s*:
		| -o-link\s*:
		| -o-link-source\s*:
		| -o-replace\s*:
		| url\s*\(
		| src\s*\(
		| image\s*\(
		| image-set\s*\(
		| attr\s*\([^)]+[\s,]+url
		!ix',
		$value
	);
	if ( $hasUnsafeToken ) {
		return '/* insecure input */';
	}

	return $value;
}
733 | |
/**
 * preg_replace_callback() handler that decodes a single CSS backslash escape.
 *
 * Reads capture groups 1-3 of the match: group 1 is a line continuation,
 * group 2 a hexadecimal character number, group 3 a single escaped
 * character. When all three are empty the match is treated as a lone
 * backslash.
 *
 * @param array $matches Match array; indexes 1, 2 and 3 must be present
 * @return string Replacement text for the escape sequence
 */
private static function cssDecodeCallback( array $matches ): string {
	if ( $matches[1] !== '' ) {
		// Line continuation: drop it entirely
		return '';
	}

	if ( $matches[2] !== '' ) {
		# hexdec could return a float if the match is too long, but the
		# regexp in question limits the string length to 6.
		$decoded = \UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$decoded = $matches[3];
	} else {
		$decoded = '\\';
	}

	// Newline, quotes and backslash must stay escaped inside strings, so
	// re-emit them as a clean hex escape that clients parse reliably.
	$mustStayEscaped = in_array( $decoded, [ "\n", '"', "'", '\\' ], true );

	return $mustStayEscaped
		? '\\' . dechex( ord( $decoded ) ) . ' '
		: $decoded;
}
756 | |
/**
 * Normalize a tag-soup fragment listing an HTML element's attributes.
 *
 * The text is decoded with decodeTagAttributes(), filtered with
 * validateTagAttributes() for the given element, optionally sorted by
 * attribute name, and re-encoded with safeEncodeTagAttributes().
 *
 * Per the original contract of this method, the result:
 * - Normalizes attribute names to lowercase
 * - Discards attributes not allowed for the given element
 * - Turns broken or invalid entities into plaintext
 * - Double-quotes all attribute values
 * - Attributes without values are given the name as attribute
 * - Double attributes are discarded
 * - Unsafe style attributes are discarded
 * - Prepends space if there are attributes.
 * - (Optionally) Sorts attributes by name.
 *
 * @param string $text
 * @param string $element
 * @param bool $sorted Whether to sort the attributes (default: false)
 * @return string Empty string when $text is blank
 */
public static function fixTagAttributes( string $text, string $element, bool $sorted = false ): string {
	if ( trim( $text ) == '' ) {
		return '';
	}

	$attribs = self::validateTagAttributes(
		self::decodeTagAttributes( $text ),
		$element
	);

	if ( $sorted ) {
		ksort( $attribs );
	}

	return self::safeEncodeTagAttributes( $attribs );
}
792 | |
/**
 * Encode an attribute value for HTML output.
 *
 * Besides the usual htmlspecialchars() escaping (both quote styles), line
 * feeds, carriage returns and tabs are emitted as numeric character
 * references so they survive the whitespace normalization that is applied
 * when attributes are decoded.
 *
 * @param string $text
 * @param-taint $text escapes_html
 * @return string HTML-encoded text fragment
 * @return-taint escaped
 */
public static function encodeAttribute( string $text ): string {
	$encValue = htmlspecialchars( $text, ENT_QUOTES );

	// Whitespace is normalized during attribute decoding,
	// so if we've been passed non-spaces we must encode them
	// ahead of time or they won't be preserved.
	return strtr( $encValue, [
		"\n" => '&#10;',
		"\r" => '&#13;',
		"\t" => '&#9;',
	] );
}
814 | |
/**
 * Armor French spaces with a replacement character
 *
 * A plain space is replaced by $space when it directly precedes one of
 * ? : ; ! % » › (and that character is not followed by a word character),
 * or when it directly follows « or ‹.
 *
 * @since 1.32
 * @param string $text Text to armor
 * @param string $space Space character for the French spaces, defaults to '&#160;'
 * @return string Armored text
 */
public static function armorFrenchSpaces( string $text, string $space = '&#160;' ): string {
	// $space is used as a preg replacement string, so '$' and '\' in it
	// must be backslash-escaped to be taken literally.
	// Replace $ with \$ and \ with \\
	$space = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );
	$fixtags = [
		# French spaces, last one Guillemet-left
		# only if it isn't followed by a word character.
		'/ (?=[?:;!%»›](?!\w))/u' => "$space",
		# French spaces, Guillemet-right
		'/([«‹]) /u' => "\\1$space",
	];
	return preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
}
835 | |
/**
 * Encode an attribute value for HTML tags, with extra armoring
 * against further wiki processing.
 *
 * The value is first encoded with encodeAttribute(). Characters and
 * sequences that later wikitext parsing could act on (braces, brackets,
 * doubled apostrophes, magic-link keywords, pipes, double underscores and
 * the colon of recognized URL protocols) are then replaced by numeric
 * character references.
 *
 * @param string $text
 * @param-taint $text escapes_html
 * @return string HTML-encoded text fragment
 * @return-taint escaped
 */
public static function safeEncodeAttribute( string $text ): string {
	$encValue = self::encodeAttribute( $text );

	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$encValue = strtr( $encValue, [
		// '<', '>', and '"' should never happen, as they indicate that we've received invalid input which should
		// have been escaped.
		'<' => '&lt;',
		'>' => '&gt;',
		'"' => '&quot;',
		'{' => '&#123;',
		'}' => '&#125;', // prevent unpaired language conversion syntax
		'[' => '&#91;',
		']' => '&#93;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	] );

	# Stupid hack
	// Break up anything that looks like a recognized URL protocol by
	// encoding its colon.
	$validProtocols = MediaWikiServices::getInstance()->getUrlUtils()->validProtocols();
	$encValue = preg_replace_callback(
		'/((?i)' . $validProtocols . ')/',
		static function ( $matches ) {
			return str_replace( ':', '&#58;', $matches[1] );
		},
		$encValue );
	return $encValue;
}
877 | |
/**
 * Escape a section name (or any other unsafe string) for use as an HTML id
 * attribute, using the encoding configured in $wgFragmentMode for $mode.
 *
 * WARNING: The output of this function is not guaranteed to be HTML safe, so be sure to use
 * proper escaping.
 *
 * @param string $id String to escape
 * @param int $mode One of ID_* constants, specifying whether the primary or fallback encoding
 *  should be used.
 * @return string|false Escaped ID or false if fallback encoding is requested but it's not
 *  configured.
 * @throws UnexpectedValueException If no primary mode is configured and it was requested
 *
 * @since 1.30
 */
public static function escapeIdForAttribute( string $id, int $mode = self::ID_PRIMARY ) {
	global $wgFragmentMode;

	if ( isset( $wgFragmentMode[$mode] ) ) {
		return self::escapeIdInternal( $id, $wgFragmentMode[$mode] );
	}

	// Nothing configured for this slot: fatal for the primary mode,
	// a soft failure for any other mode.
	if ( $mode === self::ID_PRIMARY ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return false;
}
907 | |
/**
 * Escape a section name (or any other unsafe string) for use as a URL
 * fragment, using the primary encoding from $wgFragmentMode.
 *
 * WARNING: The output of this function is not guaranteed to be HTML safe, so be sure to use
 * proper escaping.
 *
 * @param string $id String to escape
 * @return string Escaped ID
 * @throws UnexpectedValueException If $wgFragmentMode has no primary mode
 *
 * @since 1.30
 */
public static function escapeIdForLink( string $id ): string {
	global $wgFragmentMode;

	$primaryMode = $wgFragmentMode[self::ID_PRIMARY] ?? null;
	if ( $primaryMode === null ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return self::escapeIdInternalUrl( $id, $primaryMode );
}
933 | |
/**
 * Escape a section name (or any other unsafe string) for use as a URL
 * fragment on an external interwiki link, using the mode given by
 * $wgExternalInterwikiFragmentMode.
 *
 * @param string $id String to escape
 * @return string Escaped ID
 *
 * @since 1.30
 */
public static function escapeIdForExternalInterwiki( string $id ): string {
	global $wgExternalInterwikiFragmentMode;

	return self::escapeIdInternalUrl( $id, $wgExternalInterwikiFragmentMode );
}
950 | |
/**
 * Escape an id for use in an href (but not an id) attribute.
 *
 * Delegates to escapeIdInternal(); in 'html5' mode, the percent sign of
 * anything that looks like a percent-encoded byte is additionally encoded
 * as %25.
 *
 * @since 1.35
 * @see https://phabricator.wikimedia.org/T238385
 * @param string $id String to escape
 * @param string $mode One of modes from $wgFragmentMode
 * @return string
 */
private static function escapeIdInternalUrl( string $id, string $mode ): string {
	$escaped = self::escapeIdInternal( $id, $mode );

	return $mode === 'html5'
		? preg_replace( '/%([a-fA-F0-9]{2})/', '%25$1', $escaped )
		: $escaped;
}
967 | |
/**
 * Helper for escapeIdFor*() functions. Performs most of the actual escaping.
 *
 * The id is first truncated to 1024 characters. In 'html5' mode tab, LF,
 * FF, CR and space become underscores. In 'legacy' mode spaces become
 * underscores, the result is URL-encoded, and then '%3A' is turned back
 * into ':' and every other '%' into '.'.
 *
 * @param string $id String to escape
 * @param string $mode One of modes from $wgFragmentMode
 * @return string
 * @throws InvalidArgumentException If $mode is neither 'html5' nor 'legacy'
 */
private static function escapeIdInternal( string $id, string $mode ): string {
	// Truncate overly-long IDs. This isn't an HTML limit, it's just
	// griefer protection. [T251506]
	$id = mb_substr( $id, 0, 1024 );

	if ( $mode === 'html5' ) {
		// html5 spec says ids must not have any of the following:
		// U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE
		// In practice, in wikitext, only tab, LF, CR (and SPACE) are
		// possible using either Lua or html entities.
		return str_replace( [ "\t", "\n", "\f", "\r", " " ], '_', $id );
	}

	if ( $mode === 'legacy' ) {
		// This corresponds to 'noninitial' mode of the former escapeId()
		$encoded = urlencode( str_replace( ' ', '_', $id ) );
		return strtr( $encoded, [
			'%3A' => ':',
			'%' => '.'
		] );
	}

	throw new InvalidArgumentException( "Invalid mode '$mode' passed to '" . __METHOD__ );
}
1004 | |
/**
 * Escape every id in a whitespace-delimited list of ids so that each one
 * matches the form produced by escapeIdForAttribute().
 *
 * @param string $referenceString Space delimited list of ids
 * @return string The escaped ids joined by single spaces; an empty string
 *  when the input contains no ids
 */
private static function escapeIdReferenceListInternal( string $referenceString ): string {
	# Split on runs of whitespace, discarding empty tokens
	$tokens = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );

	# Escape each token as an id
	$escaped = [];
	foreach ( $tokens as $token ) {
		$escaped[] = self::escapeIdForAttribute( $token );
	}

	# Join back into a space delimited list
	return implode( ' ', $escaped );
}
1027 | |
/**
 * Given a value, escape it so that it can be used as a CSS class and
 * return it.
 *
 * A leading digit or hyphen, ASCII control characters and space, most
 * ASCII punctuation, and the byte sequence C2 A0 are replaced by
 * underscores; runs of underscores are then collapsed and trailing
 * underscores removed.
 *
 * @todo For extra validity, input should be validated UTF-8.
 *
 * @see https://www.w3.org/TR/CSS21/syndata.html Valid characters/format
 *
 * @param string $class
 * @return string
 */
public static function escapeClass( string $class ): string {
	// Convert ugly stuff to underscores...
	$escaped = preg_replace(
		'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
		'_',
		$class
	);
	// ...and kill underscores in ugly places
	$escaped = preg_replace( '/_+/', '_', $escaped );

	return rtrim( $escaped, '_' );
}
1043 | |
/**
 * Given HTML input, escape with htmlspecialchars but un-escape entities.
 * This allows (generally harmless) entities like &#160; to survive.
 *
 * Character references are decoded first, then the whole string is escaped.
 *
 * @param string $html HTML to escape
 * @param-taint $html escapes_htmlnoent
 * @return string Escaped input
 * @return-taint escaped
 */
public static function escapeHtmlAllowEntities( string $html ): string {
	$decoded = self::decodeCharReferences( $html );

	# It seems wise to escape ' as well as ", as a matter of course. Can't
	# hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
	# don't cause the entire string to disappear.
	return htmlspecialchars( $decoded, ENT_QUOTES | ENT_SUBSTITUTE );
}
1061 | |
/**
 * Return an associative array of attribute names and values from
 * a partial tag string. Attribute names are forced to lowercase,
 * character references are decoded to UTF-8 text.
 *
 * Attributes whose lowercased name does not match getAttribNameRegex() are
 * dropped. When a name occurs more than once, the last occurrence wins.
 *
 * @param string $text
 * @return array Attribute name => decoded value; empty when $text is blank
 *  or nothing matches
 */
public static function decodeTagAttributes( string $text ): array {
	if ( trim( $text ) == '' ) {
		return [];
	}

	$matchSets = [];
	$found = preg_match_all( self::getAttribsRegex(), $text, $matchSets, PREG_SET_ORDER );
	if ( !$found ) {
		return [];
	}

	$decoded = [];
	foreach ( $matchSets as $matchSet ) {
		$name = strtolower( $matchSet[1] );

		// Only keep attribute names made of acceptable characters
		if ( preg_match( self::getAttribNameRegex(), $name ) ) {
			// Collapse whitespace runs and trim before decoding references
			$raw = self::getTagAttributeCallback( $matchSet );
			$raw = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

			$decoded[$name] = self::decodeCharReferences( $raw );
		}
	}

	return $decoded;
}
1101 | |
/**
 * Build a partial tag string from an associative array of attribute
 * names and values as returned by decodeTagAttributes.
 *
 * @param array $assoc_array Attribute name => value
 * @return string Each attribute as name="value", every one preceded by a
 *  single space; an empty string for an empty array
 */
public static function safeEncodeTagAttributes( array $assoc_array ): string {
	$html = '';
	foreach ( $assoc_array as $name => $rawValue ) {
		$encName = htmlspecialchars( $name, ENT_COMPAT );
		$encValue = self::safeEncodeAttribute( $rawValue );

		$html .= " $encName=\"$encValue\"";
	}

	return $html;
}
1116 | |
/**
 * Pick the appropriate attribute value from a match set from the
 * attribs regex matches.
 *
 * Index 5 (unquoted) takes precedence over index 4 (single-quoted), which
 * takes precedence over index 3 (double-quoted).
 *
 * @param array $set One match set
 * @return string The attribute value, or an empty string for an attribute
 *  given without a value
 * @throws LogicException If index 2 is set but no value group is
 */
private static function getTagAttributeCallback( array $set ): string {
	# Unquoted, then single-quoted, then double-quoted
	$value = $set[5] ?? $set[4] ?? $set[3] ?? null;
	if ( $value !== null ) {
		return $value;
	}

	if ( !isset( $set[2] ) ) {
		# In XHTML, attributes must have a value so return an empty string.
		# See "Empty attribute syntax",
		# https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
		return "";
	}

	throw new LogicException( "Tag conditions not met. This should never happen and is a bug." );
}
1140 | |
/**
 * Collapse every run of spaces, tabs, CRs and LFs into a single space and
 * trim the result.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( string $text ): string {
	$collapsed = preg_replace( '/(?:\r\n|[\x20\x0d\x0a\x09])+/', ' ', $text );

	return trim( $collapsed );
}
1147 | |
/**
 * Normalizes whitespace in a section name, such as might be returned
 * by Parser::stripSectionName(), for use in the id's that are used for
 * section links.
 *
 * Runs of spaces and underscores become one space; the result is trimmed.
 *
 * @param string $section
 * @return string
 */
public static function normalizeSectionNameWhitespace( string $section ): string {
	$collapsed = preg_replace( '/[ _]+/', ' ', $section );

	return trim( $collapsed );
}
1156 | |
/**
 * Ensure that any entities and character references are legal
 * for XML and XHTML specifically. Any stray bits will be
 * &-escaped to result in a valid text fragment.
 *
 * a. named char refs can only be &lt; &gt; &amp; &quot;, others are
 *   numericized (this way we're well-formed even without a DTD)
 * b. any numeric char refs must be legal chars, not invalid or forbidden
 * c. use lower cased "&#x", not "&#X"
 * d. fix or reject non-valid attributes
 *
 * @internal
 * @param string $text
 * @return string
 */
public static function normalizeCharReferences( string $text ): string {
	$handler = static function ( array $matches ): string {
		return self::normalizeCharReferencesCallback( $matches );
	};

	// The count is not used; the argument only exists so that the flags
	// can be passed positionally.
	return preg_replace_callback(
		self::CHAR_REFS_REGEX,
		$handler,
		$text,
		-1,
		$ignoredCount,
		PREG_UNMATCHED_AS_NULL
	);
}
1177 | |
/**
 * preg_replace_callback() handler for normalizeCharReferences().
 *
 * Group 1 is handled as a named entity, group 2 as a decimal reference and
 * group 3 as a hexadecimal reference. When none is set, or the reference is
 * rejected, the whole match is HTML-escaped instead.
 *
 * @param array $matches
 * @return string
 */
private static function normalizeCharReferencesCallback( array $matches ): string {
	if ( isset( $matches[1] ) ) {
		$normalized = self::normalizeEntity( $matches[1] );
	} elseif ( isset( $matches[2] ) ) {
		$normalized = self::decCharReference( $matches[2] );
	} elseif ( isset( $matches[3] ) ) {
		$normalized = self::hexCharReference( $matches[3] );
	} else {
		$normalized = null;
	}

	return $normalized ?? htmlspecialchars( $matches[0], ENT_COMPAT );
}
1193 | |
/**
 * If the named entity is defined in HTML5
 * return the equivalent numeric entity reference (except for the core &lt;
 * &gt; &amp; &quot;). If the entity is a MediaWiki-specific alias, returns
 * the HTML equivalent. Otherwise, returns HTML-escaped text of
 * pseudo-entity source (eg &amp;foo;)
 *
 * @param string $name Semicolon-terminated name
 * @return string
 */
private static function normalizeEntity( string $name ): string {
	// Non-standard MediaWiki-specific entities
	if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
		return '&' . self::MW_ENTITY_ALIASES[$name];
	}

	// The four core entities are kept in word form; every other known
	// entity is turned into numeric references.
	$isCore = in_array( $name, [ 'lt;', 'gt;', 'amp;', 'quot;' ], true );
	if ( !$isCore && isset( HTMLData::$namedEntityTranslations[$name] ) ) {
		// Beware: some entities expand to more than 1 codepoint
		return preg_replace_callback(
			'/./Ssu',
			static function ( $m ) {
				return '&#' . \UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';';
			},
			HTMLData::$namedEntityTranslations[$name]
		);
	}

	return "&$name";
}
1220 | |
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint Digits of the reference
 * @return string|null The reference rewritten as "&#N;", or null when
 *  validateCodepoint() rejects the code point
 */
private static function decCharReference( string $codepoint ): ?string {
	# intval() will (safely) saturate at the maximum signed integer
	# value if $codepoint is too many digits
	$point = intval( $codepoint );

	return self::validateCodepoint( $point ) ? "&#$point;" : null;
}
1231 | |
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint Hex digits of the reference
 * @return string|null The reference rewritten as lower-case "&#x...;", or
 *  null when the value overflows an integer or validateCodepoint()
 *  rejects it
 */
private static function hexCharReference( string $codepoint ): ?string {
	$point = hexdec( $codepoint );

	// hexdec() might return a float if the string is too long
	if ( !is_int( $point ) || !self::validateCodepoint( $point ) ) {
		return null;
	}

	return sprintf( '&#x%x;', $point );
}
1241 | |
/**
 * Returns true if a given Unicode codepoint is a valid character in
 * both HTML5 and XML.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( int $codepoint ): bool {
	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	# U+007F - U+009F are disallowed in HTML5 (control characters).
	if ( $codepoint === 0x09 || $codepoint === 0x0a ) {
		return true;
	}

	// Inclusive [low, high] ranges of accepted code points
	$acceptedRanges = [
		[ 0x20, 0x7e ],
		[ 0xa0, 0xd7ff ],
		[ 0xe000, 0xfffd ],
		[ 0x10000, 0x10ffff ],
	];
	foreach ( $acceptedRanges as [ $low, $high ] ) {
		if ( $codepoint >= $low && $codepoint <= $high ) {
			return true;
		}
	}

	return false;
}
1257 | |
/**
 * Decode any character references, numeric or named entities,
 * in the text and return a UTF-8 string.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( string $text ): string {
	$handler = static function ( array $matches ): string {
		return self::decodeCharReferencesCallback( $matches );
	};

	// The count is not used; the argument only exists so that the flags
	// can be passed positionally.
	return preg_replace_callback(
		self::CHAR_REFS_REGEX,
		$handler,
		$text,
		-1,
		$ignoredCount,
		PREG_UNMATCHED_AS_NULL
	);
}
1269 | |
/**
 * Decode any character references, numeric or named entities,
 * in the text and normalize the resulting string. (T16952)
 *
 * This is useful for page titles, not for text to be displayed,
 * MediaWiki allows HTML entities to escape normalization as a feature.
 *
 * The content language's normalize() is only applied when at least one
 * replacement was made.
 *
 * @param string $text Already normalized, containing entities
 * @return string Still normalized, without entities
 */
public static function decodeCharReferencesAndNormalize( string $text ): string {
	$decoded = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		[ self::class, 'decodeCharReferencesCallback' ],
		$text,
		-1,
		$replacements,
		PREG_UNMATCHED_AS_NULL
	);

	if ( !$replacements ) {
		// Nothing was decoded, so the input's normalization still holds
		return $decoded;
	}

	return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $decoded );
}
1293 | |
/**
 * preg_replace_callback() handler shared by the decodeCharReferences*()
 * methods.
 *
 * Group 1 is decoded as a named entity, group 2 as a decimal code point
 * and group 3 as a hexadecimal code point. When none is set the match is
 * returned unchanged.
 *
 * @param array $matches
 * @return string
 */
private static function decodeCharReferencesCallback( array $matches ): string {
	if ( isset( $matches[1] ) ) {
		return self::decodeEntity( $matches[1] );
	}

	if ( isset( $matches[2] ) ) {
		return self::decodeChar( intval( $matches[2] ) );
	}

	if ( isset( $matches[3] ) ) {
		$point = hexdec( $matches[3] );
		// hexdec() might return a float if the string is too long;
		// that is an invalid character reference.
		return is_int( $point )
			? self::decodeChar( $point )
			: \UtfNormal\Constants::UTF8_REPLACEMENT;
	}

	# Last case should be an ampersand by itself
	return $matches[0];
}
1311 | |
/**
 * Return UTF-8 string for a codepoint if that is a valid
 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 *
 * @internal
 * @param int $codepoint
 * @return string
 */
private static function decodeChar( int $codepoint ): string {
	return self::validateCodepoint( $codepoint )
		? \UtfNormal\Utils::codepointToUtf8( $codepoint )
		: \UtfNormal\Constants::UTF8_REPLACEMENT;
}
1324 | |
/**
 * If the named entity is defined in HTML5
 * return the UTF-8 encoding of that character. Otherwise, returns
 * pseudo-entity source (eg "&foo;")
 *
 * MediaWiki-specific aliases are mapped to their target name before the
 * lookup.
 *
 * @param string $name Semicolon-terminated entity name
 * @return string
 */
private static function decodeEntity( string $name ): string {
	// These are MediaWiki-specific entities, not in the HTML standard
	$name = self::MW_ENTITY_ALIASES[$name] ?? $name;

	return HTMLData::$namedEntityTranslations[$name] ?? "&$name";
}
1341 | |
/**
 * Fetch the list of acceptable attributes for a given element name.
 *
 * @param string $element
 * @return array An associative array where keys are acceptable attribute
 *  names; empty when the element is not in the table
 */
private static function attributesAllowedInternal( string $element ): array {
	return self::setupAttributesAllowedInternal()[$element] ?? [];
}
1353 | |
/**
 * Build the table of attributes allowed on each HTML element.
 *
 * The table is built on the first call and kept in a static variable for
 * later calls.
 *
 * @return array An associative array: keys are HTML element names;
 *  values are associative arrays where the keys are allowed attribute
 *  names.
 */
private static function setupAttributesAllowedInternal(): array {
	static $allowed;

	if ( is_array( $allowed ) ) {
		return $allowed;
	}

	// For lookup efficiency the attribute lists are flipped so that the
	// valid attribute names are the keys.
	$with = static function ( $base, $names, $more = [] ) {
		return array_merge(
			$base,
			array_fill_keys( array_merge( $names, $more ), true )
		);
	};

	$common = $with( [], [
		# HTML
		'id',
		'class',
		'style',
		'lang',
		'dir',
		'title',
		'tabindex',

		# WAI-ARIA
		'aria-describedby',
		'aria-flowto',
		'aria-hidden',
		'aria-label',
		'aria-labelledby',
		'aria-level',
		'aria-owns',
		'role',

		# RDFa
		# These attributes are specified in section 9 of
		# https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		'about',
		'property',
		'resource',
		'datatype',
		'typeof',

		# Microdata. These are specified by
		# https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
		'itemid',
		'itemprop',
		'itemref',
		'itemscope',
		'itemtype',
	] );

	$block = $with( $common, [ 'align' ] );

	$tablealign = [ 'align', 'valign' ];
	$tablecell = [
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor', # deprecated
	];

	// Attribute sets shared by more than one element
	$quotation = $with( $common, [ 'cite' ] );
	$revision = $with( $common, [ 'cite', 'datetime' ] );
	$column = $with( $common, [ 'span' ] );
	$cell = $with( $common, $tablecell, $tablealign );

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: https://www.w3.org/TR/html4/
	$allowed = [
		# 7.5.4
		'div' => $block,
		'center' => $common, # deprecated
		'span' => $common,

		# 7.5.5
		'h1' => $block,
		'h2' => $block,
		'h3' => $block,
		'h4' => $block,
		'h5' => $block,
		'h6' => $block,

		# 7.5.6
		# address

		# 8.2.4
		'bdo' => $common,

		# 9.2.1
		'em' => $common,
		'strong' => $common,
		'cite' => $common,
		'dfn' => $common,
		'code' => $common,
		'samp' => $common,
		'kbd' => $common,
		'var' => $common,
		'abbr' => $common,
		# acronym

		# 9.2.2
		'blockquote' => $quotation,
		'q' => $quotation,

		# 9.2.3
		'sub' => $common,
		'sup' => $common,

		# 9.3.1
		'p' => $block,

		# 9.3.2
		'br' => $with( $common, [ 'clear' ] ),

		# https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
		'wbr' => $common,

		# 9.3.4
		'pre' => $with( $common, [ 'width' ] ),

		# 9.4
		'ins' => $revision,
		'del' => $revision,

		# 10.2
		'ul' => $with( $common, [ 'type' ] ),
		'ol' => $with( $common, [ 'type', 'start', 'reversed' ] ),
		'li' => $with( $common, [ 'type', 'value' ] ),

		# 10.3
		'dl' => $common,
		'dd' => $common,
		'dt' => $common,

		# 11.2.1
		'table' => $with( $common, [
			'summary', 'width', 'border', 'frame',
			'rules', 'cellspacing', 'cellpadding',
			'align', 'bgcolor',
		] ),

		# 11.2.2
		'caption' => $block,

		# 11.2.3
		'thead' => $common,
		'tfoot' => $common,
		'tbody' => $common,

		# 11.2.4
		'colgroup' => $column,
		'col' => $column,

		# 11.2.5
		'tr' => $with( $common, [ 'bgcolor' ], $tablealign ),

		# 11.2.6
		'td' => $cell,
		'th' => $cell,

		# 12.2
		# NOTE: <a> is not allowed directly, but this list of allowed
		# attributes is used from the Parser object
		'a' => $with( $common, [ 'href', 'rel', 'rev' ] ), # rel/rev esp. for RDFa

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized
		'img' => $with( $common, [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
		# Attributes for A/V tags added in T163583 / T133673
		'audio' => $with( $common, [ 'controls', 'preload', 'width', 'height' ] ),
		'video' => $with( $common, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
		'source' => $with( $common, [ 'type', 'src' ] ),
		'track' => $with( $common, [ 'type', 'src', 'srclang', 'kind', 'label' ] ),

		# 15.2.1
		'tt' => $common,
		'b' => $common,
		'i' => $common,
		'big' => $common,
		'small' => $common,
		'strike' => $common,
		's' => $common,
		'u' => $common,

		# 15.2.2
		'font' => $with( $common, [ 'size', 'color', 'face' ] ),
		# basefont

		# 15.3
		'hr' => $with( $common, [ 'width' ] ),

		# HTML Ruby annotation text module, simple ruby only.
		# https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
		'ruby' => $common,
		# rbc
		'rb' => $common,
		'rp' => $common,
		'rt' => $common, # $with( $common, [ 'rbspan' ] ),
		'rtc' => $common,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# https://www.w3.org/TR/REC-MathML/
		'math' => $with( [], [ 'class', 'style', 'id', 'title' ] ),

		// HTML 5 section 4.5
		'figure' => $common,
		'figcaption' => $common,

		# HTML 5 section 4.6
		'bdi' => $common,

		# HTML5 elements, defined by:
		# https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
		'data' => $with( $common, [ 'value' ] ),
		'time' => $with( $common, [ 'datetime' ] ),
		'mark' => $common,

		// meta and link are only permitted by internalRemoveHtmlTags when Microdata
		// is enabled so we don't bother adding a conditional to hide these
		// Also meta and link are only valid in WikiText as Microdata elements
		// (ie: validateTag rejects tags missing the attributes needed for Microdata)
		// So we don't bother including $common attributes that have no purpose.
		'meta' => $with( [], [ 'itemprop', 'content' ] ),
		'link' => $with( [], [ 'itemprop', 'href', 'title' ] ),

		# HTML 5 section 4.3.5
		'aside' => $common,
	];

	return $allowed;
}
1597 | |
/**
 * Strip every tag from a fragment of (potentially invalid) HTML and
 * return what is left, encoded as plain text.
 *
 * Warning: the return value is plain text, not HTML. It must be escaped
 * again before literal inclusion in HTML output (as of 1.10)!
 *
 * @param string $html HTML fragment
 * @return string
 * @return-taint tainted
 */
public static function stripAllTags( string $html ): string {
	// Tokenizer options. Character references are deliberately NOT
	// ignored: we want them to be decoded into the resulting text.
	$tokenizerOptions = [
		'ignoreErrors' => true,
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	];

	// Let RemexHtml tokenize $html; the handler collects the text.
	$textCollector = new RemexStripTagHandler;
	$remex = new RemexTokenizer( $textCollector, $html, $tokenizerOptions );
	$remex->execute();

	return self::normalizeWhitespace( $textCollector->getResult() );
}
1624 | |
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations.
 * PHP 4 seemed to know these if you gave it an HTML doctype, but
 * PHP 5.1 doesn't.
 *
 * Use for passing XHTML fragments to PHP's XML parsing functions.
 *
 * One <!ENTITY> declaration is emitted per entry of
 * HTMLData::$namedEntityTranslations, except that:
 *  - names without a trailing semicolon are skipped, and
 *  - entities whose normalized form is just the named reference itself
 *    (&lt; &gt; etc.) are skipped, so they are not re-declared in terms
 *    of themselves.
 *
 * @deprecated since 1.36; will be made private or removed in a future
 *  release.
 * @return string DOCTYPE declaration with an internal subset, ending in a newline
 */
public static function hackDocType(): string {
	$out = "<!DOCTYPE html [\n";
	foreach ( HTMLData::$namedEntityTranslations as $entity => $translation ) {
		if ( substr( $entity, -1 ) !== ';' ) {
			// Some HTML entities omit the trailing semicolon;
			// wikitext does not permit these.
			continue;
		}
		// Entity name as used in the declaration: the key minus its ';'.
		$name = substr( $entity, 0, -1 );
		$expansion = self::normalizeEntity( $entity );
		// $entity carries no leading '&' (it is used above as the XML
		// entity name), so prepend it before comparing with the
		// expansion; comparing the bare key could never match.
		if ( "&$entity" === $expansion ) {
			// Skip &lt; &gt; etc
			continue;
		}
		$out .= "<!ENTITY $name \"$expansion\">";
	}
	$out .= "]>\n";
	return $out;
}
1654 | |
/**
 * Clean up a URL taken from wikitext.
 *
 * Character references are decoded, then brackets, angle brackets,
 * double quotes, pipes and the bytes 0x00-0x20 and 0x7F are
 * URL-encoded. If the result has the shape "scheme:[//host]rest",
 * characters that are ignored in IDNs are removed from the host part
 * and a URL-encoded bracketed IPv6 literal is restored to "[...]".
 * Anything that does not match that shape is returned after the
 * first two steps only.
 *
 * @param string $url
 * @return string
 */
public static function cleanUrl( string $url ): string {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = self::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step
	$url = preg_replace_callback(
		'/[\][<>"\\x00-\\x20\\x7F\|]+/',
		static function ( $m ) {
			return urlencode( $m[0] );
		},
		$url
	);

	# Validate hostname portion
	$parts = [];
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $parts ) ) {
		// Not of the form "scheme:..."; nothing more to do.
		return $url;
	}
	$protocol = $parts[1];
	$host = $parts[2];
	$rest = $parts[3];

	// Characters that will be ignored in IDNs.
	// https://datatracker.ietf.org/doc/html/rfc8264#section-9.13
	// https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
	// Strip them before further processing so deny lists and such work.
	$ignoredInIdn = "/
		\\s| # general whitespace
		\u{00AD}| # SOFT HYPHEN
		\u{034F}| # COMBINING GRAPHEME JOINER
		\u{061C}| # ARABIC LETTER MARK
		[\u{115F}-\u{1160}]| # HANGUL CHOSEONG FILLER..
		# HANGUL JUNGSEONG FILLER
		[\u{17B4}-\u{17B5}]| # KHMER VOWEL INHERENT AQ..
		# KHMER VOWEL INHERENT AA
		[\u{180B}-\u{180D}]| # MONGOLIAN FREE VARIATION SELECTOR ONE..
		# MONGOLIAN FREE VARIATION SELECTOR THREE
		\u{180E}| # MONGOLIAN VOWEL SEPARATOR
		[\u{200B}-\u{200F}]| # ZERO WIDTH SPACE..
		# RIGHT-TO-LEFT MARK
		[\u{202A}-\u{202E}]| # LEFT-TO-RIGHT EMBEDDING..
		# RIGHT-TO-LEFT OVERRIDE
		[\u{2060}-\u{2064}]| # WORD JOINER..
		# INVISIBLE PLUS
		\u{2065}| # <reserved-2065>
		[\u{2066}-\u{206F}]| # LEFT-TO-RIGHT ISOLATE..
		# NOMINAL DIGIT SHAPES
		\u{3164}| # HANGUL FILLER
		[\u{FE00}-\u{FE0F}]| # VARIATION SELECTOR-1..
		# VARIATION SELECTOR-16
		\u{FEFF}| # ZERO WIDTH NO-BREAK SPACE
		\u{FFA0}| # HALFWIDTH HANGUL FILLER
		[\u{FFF0}-\u{FFF8}]| # <reserved-FFF0>..
		# <reserved-FFF8>
		[\u{1BCA0}-\u{1BCA3}]| # SHORTHAND FORMAT LETTER OVERLAP..
		# SHORTHAND FORMAT UP STEP
		[\u{1D173}-\u{1D17A}]| # MUSICAL SYMBOL BEGIN BEAM..
		# MUSICAL SYMBOL END PHRASE
		\u{E0000}| # <reserved-E0000>
		\u{E0001}| # LANGUAGE TAG
		[\u{E0002}-\u{E001F}]| # <reserved-E0002>..
		# <reserved-E001F>
		[\u{E0020}-\u{E007F}]| # TAG SPACE..
		# CANCEL TAG
		[\u{E0080}-\u{E00FF}]| # <reserved-E0080>..
		# <reserved-E00FF>
		[\u{E0100}-\u{E01EF}]| # VARIATION SELECTOR-17..
		# VARIATION SELECTOR-256
		[\u{E01F0}-\u{E0FFF}]| # <reserved-E01F0>..
		# <reserved-E0FFF>
		/xuD";

	$host = preg_replace( $ignoredInIdn, '', $host );

	// IPv6 host names are bracketed with []. Url-decode these: the
	// brackets were turned into %5B / %5D by the escaping step above.
	$ipv6 = [];
	if ( str_starts_with( $host, "//%5B" ) &&
		preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $ipv6 )
	) {
		$host = '//[' . $ipv6[1] . ']' . $ipv6[2];
	}

	// @todo FIXME: Validate hostnames here

	return $protocol . $host . $rest;
}
1735 | |
/**
 * Does a string look like an e-mail address?
 *
 * This validates an email address using an HTML5 specification found at:
 * http://www.whatwg.org/html/states-of-the-type-attribute.html#valid-e-mail-address
 * Which as of 2011-01-24 says:
 *
 *   A valid e-mail address is a string that matches the ABNF production
 *   1*( atext / "." ) "@" ldh-str *( "." ldh-str ) where atext is defined
 *   in RFC 5322 section 3.2.3, and ldh-str is defined in RFC 1034 section
 *   3.5.
 *
 * This function is an implementation of the specification as requested in
 * T24449.
 *
 * Client-side forms will use the same standard validation rules via JS or
 * HTML 5 validation; additional restrictions can be enforced server-side
 * by extensions via the 'isValidEmailAddr' hook.
 *
 * Note that this validation doesn't 100% match RFC 2822, but is believed
 * to be liberal enough for wide use. Some invalid addresses will still
 * pass validation here.
 *
 * The whole string must match: an address followed by a line break is
 * rejected.
 *
 * @since 1.18
 *
 * @param string $addr E-mail address
 * @return bool True if the hook or the built-in pattern accepts $addr
 */
public static function validateEmail( string $addr ): bool {
	$result = null;
	// TODO This method should be non-static, and have a HookRunner injected
	$hookRunner = new HookRunner( MediaWikiServices::getInstance()->getHookContainer() );
	if ( !$hookRunner->onIsValidEmailAddr( $addr, $result ) ) {
		// A handler aborted the hook. It may have left $result as null;
		// cast so the declared bool return type is always honoured
		// (an unset result is treated as "not valid").
		return (bool)$result;
	}

	// Please note the strings below are used inside brackets [], which
	// makes the hyphen "-" a range indicator. Hence it is double
	// backslashed below.
	// See T28948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	$html5_email_regexp = "/
	^                         # start of string
	[$rfc5322_atext\\.]+      # user part which is liberal :p
	@                         # literal at sign
	[$rfc1034_ldh_str]+       # First domain part
	(\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
	$                         # End of string
	/ixD"; // case Insensitive, eXtended, Dollar matches only at the very end

	return (bool)preg_match( $html5_email_regexp, $addr );
}
1789 | } |
1790 | |
/**
 * Register the global name 'Sanitizer' as an alias of this class.
 * NOTE(review): presumably kept so callers using the old un-namespaced
 * class name keep working; confirm against the file's namespace.
 *
 * @deprecated class alias since 1.41
 */
class_alias( Sanitizer::class, 'Sanitizer' );