MediaWiki REL1_35
Sanitizer.php
Go to the documentation of this file.
1<?php
28
33class Sanitizer {
38 private const CHAR_REFS_REGEX =
39 '/&([A-Za-z0-9\x80-\xff]+);
40 |&\#([0-9]+);
41 |&\#[xX]([0-9A-Fa-f]+);
42 |(&)/x';
43
48 private const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
49
59 private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
60 private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
61
67 public const ID_PRIMARY = 0;
68
75 public const ID_FALLBACK = 1;
76
82 private const HTML_ENTITIES = [
83 'Aacute' => 193,
84 'aacute' => 225,
85 'Acirc' => 194,
86 'acirc' => 226,
87 'acute' => 180,
88 'AElig' => 198,
89 'aelig' => 230,
90 'Agrave' => 192,
91 'agrave' => 224,
92 'alefsym' => 8501,
93 'Alpha' => 913,
94 'alpha' => 945,
95 'amp' => 38,
96 'and' => 8743,
97 'ang' => 8736,
98 'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
99 'Aring' => 197,
100 'aring' => 229,
101 'asymp' => 8776,
102 'Atilde' => 195,
103 'atilde' => 227,
104 'Auml' => 196,
105 'auml' => 228,
106 'bdquo' => 8222,
107 'Beta' => 914,
108 'beta' => 946,
109 'brvbar' => 166,
110 'bull' => 8226,
111 'cap' => 8745,
112 'Ccedil' => 199,
113 'ccedil' => 231,
114 'cedil' => 184,
115 'cent' => 162,
116 'Chi' => 935,
117 'chi' => 967,
118 'circ' => 710,
119 'clubs' => 9827,
120 'cong' => 8773,
121 'copy' => 169,
122 'crarr' => 8629,
123 'cup' => 8746,
124 'curren' => 164,
125 'dagger' => 8224,
126 'Dagger' => 8225,
127 'darr' => 8595,
128 'dArr' => 8659,
129 'deg' => 176,
130 'Delta' => 916,
131 'delta' => 948,
132 'diams' => 9830,
133 'divide' => 247,
134 'Eacute' => 201,
135 'eacute' => 233,
136 'Ecirc' => 202,
137 'ecirc' => 234,
138 'Egrave' => 200,
139 'egrave' => 232,
140 'empty' => 8709,
141 'emsp' => 8195,
142 'ensp' => 8194,
143 'Epsilon' => 917,
144 'epsilon' => 949,
145 'equiv' => 8801,
146 'Eta' => 919,
147 'eta' => 951,
148 'ETH' => 208,
149 'eth' => 240,
150 'Euml' => 203,
151 'euml' => 235,
152 'euro' => 8364,
153 'exist' => 8707,
154 'fnof' => 402,
155 'forall' => 8704,
156 'frac12' => 189,
157 'frac14' => 188,
158 'frac34' => 190,
159 'frasl' => 8260,
160 'Gamma' => 915,
161 'gamma' => 947,
162 'ge' => 8805,
163 'gt' => 62,
164 'harr' => 8596,
165 'hArr' => 8660,
166 'hearts' => 9829,
167 'hellip' => 8230,
168 'Iacute' => 205,
169 'iacute' => 237,
170 'Icirc' => 206,
171 'icirc' => 238,
172 'iexcl' => 161,
173 'Igrave' => 204,
174 'igrave' => 236,
175 'image' => 8465,
176 'infin' => 8734,
177 'int' => 8747,
178 'Iota' => 921,
179 'iota' => 953,
180 'iquest' => 191,
181 'isin' => 8712,
182 'Iuml' => 207,
183 'iuml' => 239,
184 'Kappa' => 922,
185 'kappa' => 954,
186 'Lambda' => 923,
187 'lambda' => 955,
188 'lang' => 9001,
189 'laquo' => 171,
190 'larr' => 8592,
191 'lArr' => 8656,
192 'lceil' => 8968,
193 'ldquo' => 8220,
194 'le' => 8804,
195 'lfloor' => 8970,
196 'lowast' => 8727,
197 'loz' => 9674,
198 'lrm' => 8206,
199 'lsaquo' => 8249,
200 'lsquo' => 8216,
201 'lt' => 60,
202 'macr' => 175,
203 'mdash' => 8212,
204 'micro' => 181,
205 'middot' => 183,
206 'minus' => 8722,
207 'Mu' => 924,
208 'mu' => 956,
209 'nabla' => 8711,
210 'nbsp' => 160,
211 'ndash' => 8211,
212 'ne' => 8800,
213 'ni' => 8715,
214 'not' => 172,
215 'notin' => 8713,
216 'nsub' => 8836,
217 'Ntilde' => 209,
218 'ntilde' => 241,
219 'Nu' => 925,
220 'nu' => 957,
221 'Oacute' => 211,
222 'oacute' => 243,
223 'Ocirc' => 212,
224 'ocirc' => 244,
225 'OElig' => 338,
226 'oelig' => 339,
227 'Ograve' => 210,
228 'ograve' => 242,
229 'oline' => 8254,
230 'Omega' => 937,
231 'omega' => 969,
232 'Omicron' => 927,
233 'omicron' => 959,
234 'oplus' => 8853,
235 'or' => 8744,
236 'ordf' => 170,
237 'ordm' => 186,
238 'Oslash' => 216,
239 'oslash' => 248,
240 'Otilde' => 213,
241 'otilde' => 245,
242 'otimes' => 8855,
243 'Ouml' => 214,
244 'ouml' => 246,
245 'para' => 182,
246 'part' => 8706,
247 'permil' => 8240,
248 'perp' => 8869,
249 'Phi' => 934,
250 'phi' => 966,
251 'Pi' => 928,
252 'pi' => 960,
253 'piv' => 982,
254 'plusmn' => 177,
255 'pound' => 163,
256 'prime' => 8242,
257 'Prime' => 8243,
258 'prod' => 8719,
259 'prop' => 8733,
260 'Psi' => 936,
261 'psi' => 968,
262 'quot' => 34,
263 'radic' => 8730,
264 'rang' => 9002,
265 'raquo' => 187,
266 'rarr' => 8594,
267 'rArr' => 8658,
268 'rceil' => 8969,
269 'rdquo' => 8221,
270 'real' => 8476,
271 'reg' => 174,
272 'rfloor' => 8971,
273 'Rho' => 929,
274 'rho' => 961,
275 'rlm' => 8207,
276 'rsaquo' => 8250,
277 'rsquo' => 8217,
278 'sbquo' => 8218,
279 'Scaron' => 352,
280 'scaron' => 353,
281 'sdot' => 8901,
282 'sect' => 167,
283 'shy' => 173,
284 'Sigma' => 931,
285 'sigma' => 963,
286 'sigmaf' => 962,
287 'sim' => 8764,
288 'spades' => 9824,
289 'sub' => 8834,
290 'sube' => 8838,
291 'sum' => 8721,
292 'sup' => 8835,
293 'sup1' => 185,
294 'sup2' => 178,
295 'sup3' => 179,
296 'supe' => 8839,
297 'szlig' => 223,
298 'Tau' => 932,
299 'tau' => 964,
300 'there4' => 8756,
301 'Theta' => 920,
302 'theta' => 952,
303 'thetasym' => 977,
304 'thinsp' => 8201,
305 'THORN' => 222,
306 'thorn' => 254,
307 'tilde' => 732,
308 'times' => 215,
309 'trade' => 8482,
310 'Uacute' => 218,
311 'uacute' => 250,
312 'uarr' => 8593,
313 'uArr' => 8657,
314 'Ucirc' => 219,
315 'ucirc' => 251,
316 'Ugrave' => 217,
317 'ugrave' => 249,
318 'uml' => 168,
319 'upsih' => 978,
320 'Upsilon' => 933,
321 'upsilon' => 965,
322 'Uuml' => 220,
323 'uuml' => 252,
324 'weierp' => 8472,
325 'Xi' => 926,
326 'xi' => 958,
327 'Yacute' => 221,
328 'yacute' => 253,
329 'yen' => 165,
330 'Yuml' => 376,
331 'yuml' => 255,
332 'Zeta' => 918,
333 'zeta' => 950,
334 'zwj' => 8205,
335 'zwnj' => 8204
336 ];
337
341 private const HTML_ENTITY_ALIASES = [
342 'רלמ' => 'rlm',
343 'رلم' => 'rlm',
344 ];
345
349 private static $attribsRegex;
350
/**
 * Return the regular expression that splits the inside of a tag into
 * attribute name/value pairs, building it on first use.
 *
 * Capture groups: 1 = attribute name, 2 = the whole "= value" part,
 * 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value.
 *
 * @return string PCRE pattern (cached in self::$attribsRegex)
 */
private static function getAttribsRegex() {
	if ( self::$attribsRegex !== null ) {
		return self::$attribsRegex;
	}

	// TAB, LF, FF, CR and SPACE
	$wsChars = '\x09\x0a\x0c\x0d\x20';
	$ws = "[{$wsChars}]";
	$nameChar = "[^{$wsChars}\/>=]";
	// The first character of a name may additionally be "="
	$nameStart = "(?:{$nameChar}|=)";

	self::$attribsRegex =
		"/({$nameStart}{$nameChar}*)
			($ws*=$ws*
			(?:
				# The attribute value: quoted or alone
				\"([^\"]*)(?:\"|\$)
				| '([^']*)(?:'|\$)
				| (((?!$ws|>).)*)
			)
		)?/sxu";

	return self::$attribsRegex;
}
376
380 private static $attribNameRegex;
381
/**
 * Return the regular expression that an attribute name must match in
 * full, building it on first use.
 *
 * A name starts with a colon, underscore, Unicode letter or Unicode
 * number; later characters may also be a dot or a hyphen.
 *
 * @return string PCRE pattern (cached in self::$attribNameRegex)
 */
private static function getAttribNameRegex() {
	if ( self::$attribNameRegex !== null ) {
		return self::$attribNameRegex;
	}

	$firstChar = "[:_\p{L}\p{N}]";
	$laterChar = "[:_\.\-\p{L}\p{N}]";
	self::$attribNameRegex = "/^({$firstChar}{$laterChar}*)$/sxu";

	return self::$attribNameRegex;
}
394
/**
 * Return the lookup tables describing which HTML tags are recognised.
 *
 * Every returned table is keyed by tag name (built with array_flip), so
 * callers test membership with isset().
 *
 * @param array $extratags Additional tag names to accept as paired elements
 * @param array $removetags Tag names to drop from the recognised set
 * @return array Map with the keys 'htmlpairs', 'htmlsingle', 'htmlsingleonly',
 *   'htmlnest', 'tabletags', 'htmllist', 'listtags', 'htmlsingleallowed'
 *   and 'htmlelements'
 */
public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
	global $wgAllowImageTag;

	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

	// The cached tables are tied to the value of the global they were built
	// from, so that a changed global (e.g. in tests) triggers a rebuild.
	$globalContext = $wgAllowImageTag;
	if ( !$staticInitialised || $staticInitialised != $globalContext ) {
		# Tags that must be closed
		$pairList = [
			'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
			'kbd', 'samp', 'data', 'time', 'mark'
		];
		$singleList = [
			'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
		];
		# Elements that cannot have close tags. This is (not coincidentally)
		# also the list of tags for which the HTML 5 parsing algorithm
		# requires you to "acknowledge the token's self-closing flag", i.e.
		# a self-closing tag like <br/> is not an HTML 5 parse error only
		# for this list.
		$singleOnlyList = [
			'br', 'wbr', 'hr', 'meta', 'link'
		];
		# Tags that can be nested--??
		$nestList = [
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
			'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
		];
		# Can only appear inside table, we will close them
		$tableList = [
			'td', 'th', 'tr',
		];
		# Tags used by list
		$listContainerList = [
			'ul', 'ol',
		];
		# Tags that can appear in a list
		$listItemList = [
			'li',
		];

		if ( $wgAllowImageTag ) {
			wfDeprecatedMsg( 'Setting $wgAllowImageTag to true ' .
				'is deprecated since MediaWiki 1.35', '1.35', false, false );
			$singleList[] = 'img';
			$singleOnlyList[] = 'img';
		}

		# Store everything as hashtables for faster lookup. The combined
		# lists are derived from the plain lists before flipping.
		$htmlsingleallowed = array_flip(
			array_unique( array_merge( $singleList, $tableList ) )
		);
		$htmlelementsStatic = array_flip(
			array_unique( array_merge( $singleList, $pairList, $nestList ) )
		);
		$htmlpairsStatic = array_flip( $pairList );
		$htmlsingle = array_flip( $singleList );
		$htmlsingleonly = array_flip( $singleOnlyList );
		$htmlnest = array_flip( $nestList );
		$tabletags = array_flip( $tableList );
		$htmllist = array_flip( $listContainerList );
		$listtags = array_flip( $listItemList );

		$staticInitialised = $globalContext;
	}

	# Apply the per-call $extratags and $removetags on top of the cached tables
	$extraLookup = array_flip( $extratags );
	$removeLookup = array_flip( $removetags );

	return [
		'htmlpairs' => array_merge( $extraLookup, $htmlpairsStatic ),
		'htmlsingle' => $htmlsingle,
		'htmlsingleonly' => $htmlsingleonly,
		'htmlnest' => $htmlnest,
		'tabletags' => $tabletags,
		'htmllist' => $htmllist,
		'listtags' => $listtags,
		'htmlsingleallowed' => $htmlsingleallowed,
		'htmlelements' => array_diff_key(
			array_merge( $extraLookup, $htmlelementsStatic ),
			$removeLookup
		),
	];
}
484
/**
 * Escape every tag in $text that is not a recognised HTML element, and
 * clean up the attributes of the ones that are.
 *
 * HTML comments are removed first. The text is then split on "<"; each
 * piece that parses as a recognised, valid tag is re-emitted with
 * sanitized attributes, and every other piece is emitted with its "<"
 * and ">" escaped.
 *
 * @param string $text
 * @param callable|null $processCallback Called with the attribute text (by
 *   reference) and $args before the attributes are validated
 * @param array|bool $args Second argument passed to $processCallback
 * @param array $extratags Extra tag names to allow
 * @param array $removetags Tag names to disallow
 * @return string
 */
public static function removeHTMLtags( $text, $processCallback = null,
	$args = [], $extratags = [], $removetags = []
) {
	$tagData = self::getRecognizedTagData( $extratags, $removetags );
	$singleTags = $tagData['htmlsingle'];
	$voidTags = $tagData['htmlsingleonly'];
	$knownTags = $tagData['htmlelements'];

	# Remove HTML comments
	$segments = explode( '<', self::removeHTMLcomments( $text ) );
	$out = str_replace( '>', '&gt;', array_shift( $segments ) );

	# this might be possible using remex tidy itself
	foreach ( $segments as $segment ) {
		// What gets emitted when the segment is not an acceptable tag
		$escapedSegment = '&lt;' . str_replace( '>', '&gt;', $segment );

		if ( !preg_match( self::ELEMENT_BITS_REGEX, $segment, $parts ) ) {
			$out .= $escapedSegment;
			continue;
		}

		[ , $slash, $tag, $params, $brace, $rest ] = $parts;
		$tag = strtolower( $tag );
		if ( !isset( $knownTags[$tag] ) ) {
			$out .= $escapedSegment;
			continue;
		}

		if ( is_callable( $processCallback ) ) {
			call_user_func_array( $processCallback, [ &$params, $args ] );
		}

		if ( $brace == '/>' && !isset( $singleTags[$tag] ) && !isset( $voidTags[$tag] ) ) {
			// Remove the self-closing slash, to be consistent
			// with HTML5 semantics. T134423
			$brace = '>';
		}

		$tagIsValid = self::validateTag( $params, $tag );
		$newparams = self::fixTagAttributes( $params, $tag );
		if ( !$tagIsValid ) {
			$out .= $escapedSegment;
			continue;
		}

		if ( $brace === '/>' && !isset( $voidTags[$tag] ) ) {
			# Interpret self-closing tags as empty tags even when
			# HTML 5 would interpret them as start tags. Such input
			# is commonly seen on Wikimedia wikis with this intention.
			$brace = "></$tag>";
		}

		$out .= "<$slash$tag$newparams$brace" . str_replace( '>', '&gt;', $rest );
	}

	return $out;
}
555
/**
 * Strip every terminated HTML comment from $text.
 *
 * When a comment, together with the spaces around it, occupies a whole
 * line (newline before and after), the line is removed and one newline is
 * kept. Otherwise only the comment itself is removed. An unterminated
 * comment is left in place, along with everything after it.
 *
 * @param string $text
 * @return string
 */
public static function removeHTMLcomments( $text ) {
	for ( ; ; ) {
		$start = strpos( $text, '<!--' );
		if ( $start === false ) {
			break;
		}

		$close = strpos( $text, '-->', $start + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		$end = $close + 3;

		# Grow a window around the comment over any adjacent spaces, so we
		# can tell whether the comment is preceded and followed by a newline
		$spaceStart = max( $start - 1, 0 );
		$spaceLen = $end - $spaceStart;
		while ( $spaceStart > 0 && substr( $text, $spaceStart, 1 ) === ' ' ) {
			$spaceStart--;
			$spaceLen++;
		}
		while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
			$spaceLen++;
		}

		$newlineBefore = substr( $text, $spaceStart, 1 ) === "\n";
		$newlineAfter = substr( $text, $spaceStart + $spaceLen, 1 ) === "\n";

		if ( $newlineBefore && $newlineAfter ) {
			# Remove the comment, leading and trailing
			# spaces, and leave only one newline.
			$text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
		} else {
			# Remove just the comment.
			$text = substr_replace( $text, '', $start, $end - $start );
		}
	}

	return $text;
}
598
/**
 * Check that a tag carries the attributes it cannot do without.
 *
 * Only <meta> and <link> have requirements: both need an itemprop
 * attribute; <meta> also needs content, and <link> also needs href.
 * Every other element passes.
 *
 * @param string $params Raw attribute text of the tag
 * @param string $element Lower-case tag name
 * @return bool False when a required attribute is missing
 */
private static function validateTag( $params, $element ) {
	$attribs = self::decodeTagAttributes( $params );

	$isMeta = $element == 'meta';
	$isLink = $element == 'link';
	if ( !$isMeta && !$isLink ) {
		return true;
	}

	// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
	if ( !isset( $attribs['itemprop'] ) ) {
		return false;
	}
	// <meta> must have a content="" for the itemprop
	if ( $isMeta && !isset( $attribs['content'] ) ) {
		return false;
	}
	// <link> must have an associated href=""
	if ( $isLink && !isset( $attribs['href'] ) ) {
		return false;
	}

	return true;
}
631
/**
 * Filter an attribute map down to what is allowed on the given element.
 *
 * The allowed set comes from self::attributesAllowedInternal(); the
 * filtering itself is done by self::validateAttributes().
 *
 * @param array $attribs Attribute name => value
 * @param string $element Tag name
 * @return array Filtered attribute name => value
 */
public static function validateTagAttributes( $attribs, $element ) {
	$allowed = self::attributesAllowedInternal( $element );

	return self::validateAttributes( $attribs, $allowed );
}
651
/**
 * Filter and clean an attribute map against a set of allowed names.
 *
 * Attributes are dropped when they are not allowed, are reserved data
 * attributes, or carry an unsafe value. Values of style, id and the
 * id-list ARIA attributes are rewritten through the matching helpers.
 *
 * @param array $attribs Attribute name => value
 * @param array $allowed Allowed attribute names as array keys; a
 *   sequential list of names is still accepted but deprecated
 * @return array Filtered attribute name => value
 */
public static function validateAttributes( $attribs, $allowed ) {
	if ( isset( $allowed[0] ) ) {
		// Calling this function with a sequential array is
		// deprecated. For now just convert it.
		wfDeprecated( __METHOD__ . ' with sequential array', '1.35' );
		$allowed = array_flip( $allowed );
	}
	$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	# Attributes whose value is a list of HTML ids
	$idListAttribs = [ 'aria-describedby', 'aria-flowto', 'aria-labelledby', 'aria-owns' ];
	# RDFa and microdata properties allow URLs, URIs and/or CURIs.
	$uriLikeAttribs = [
		'rel', 'rev',
		# RDFa
		'about', 'property', 'resource', 'datatype', 'typeof',
		# HTML5 microdata
		'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',
	];
	$urlAttribs = [ 'href', 'src', 'poster' ];

	$out = [];
	foreach ( $attribs as $attribute => $value ) {
		# Allow XML namespace declaration to allow RDFa
		if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				$out[$attribute] = $value;
			}
			continue;
		}

		# Allow any attribute beginning with "data-"
		# However:
		# * Disallow data attributes used by MediaWiki code
		# * Ensure that the attribute is not namespaced by banning
		#   colons.
		$isPermitted = preg_match( '/^data-[^:]*$/i', $attribute )
			|| array_key_exists( $attribute, $allowed );
		if ( !$isPermitted || self::isReservedDataAttribute( $attribute ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# https://msdn.microsoft.com/en-us/library/ms537634.aspx
		if ( $attribute == 'style' ) {
			$value = self::checkCss( $value );
		}

		# Escape HTML id attributes
		if ( $attribute === 'id' ) {
			$value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
		}

		# Escape HTML id reference lists
		if ( in_array( $attribute, $idListAttribs, true ) ) {
			$value = self::escapeIdReferenceList( $value );
		}

		// Paranoia. Allow "simple" values but suppress javascript
		if ( in_array( $attribute, $uriLikeAttribs, true )
			&& preg_match( self::EVIL_URI_PATTERN, $value )
		) {
			continue;
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		#       validation code that can be used by tag hook handlers, etc
		if ( in_array( $attribute, $urlAttribs, true ) && !preg_match( $hrefExp, $value ) ) {
			// drop any href or src attributes not using an allowed protocol.
			// NOTE: this also drops all relative URLs
			continue;
		}

		// Only allow tabindex of 0, which is useful for accessibility.
		if ( $attribute === 'tabindex' && $value !== '0' ) {
			continue;
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$out[$attribute] = $value;
	}

	# itemtype, itemid, itemref don't make sense without itemscope
	if ( !array_key_exists( 'itemscope', $out ) ) {
		unset( $out['itemtype'], $out['itemid'], $out['itemref'] );
	}
	# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

	return $out;
}
769
/**
 * Tell whether an attribute name falls in a reserved data-* namespace.
 *
 * data-ooui is reserved for ooui. data-mw and data-parsoid are reserved
 * for parsoid. data-mw-<name here> is reserved for extensions (or core)
 * that need to pass data to the client and be sure it does not come from
 * an untrusted user. Namespaces are not considered, since user-generated
 * HTML can't use them anymore.
 *
 * @param string $attr Attribute name
 * @return bool True when the name starts with data-ooui, data-mw or
 *   data-parsoid (case-insensitive)
 */
public static function isReservedDataAttribute( $attr ) {
	$reservedPrefix = '/^data-(ooui|mw|parsoid)/i';

	return preg_match( $reservedPrefix, $attr ) === 1;
}
787
/**
 * Merge two attribute maps; values in $b win over values in $a.
 *
 * The exception is "class": when both maps hold different string
 * values, the class names of both are combined into one space-separated
 * list with duplicates removed.
 *
 * @param array $a
 * @param array $b
 * @return array
 */
public static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	$classA = $a['class'] ?? null;
	$classB = $b['class'] ?? null;
	if ( !is_string( $classA ) || !is_string( $classB ) || $classA === $classB ) {
		return $merged;
	}

	$names = preg_split( '/\s+/', $classA . ' ' . $classB, -1, PREG_SPLIT_NO_EMPTY );
	$merged['class'] = implode( ' ', array_unique( $names ) );

	return $merged;
}
810
/**
 * Normalize a CSS value so that later checks see its real content.
 *
 * Steps, in order: decode character references, decode CSS backslash
 * escapes and line continuations, then (unless the value is nothing but
 * a single comment) replace comments with a space and cut the value at
 * any remaining comment-start token.
 *
 * @param string $value
 * @return string
 */
public static function normalizeCss( $value ) {
	static $decodeRegex;

	// Decode character references like &#123;
	$value = self::decodeCharReferences( $value );

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references.
	// This means it isn't possible for this function to return
	// unsanitized escape sequences. It is possible to manufacture
	// input that contains character references that decode to
	// escape sequences that decode to character references, but
	// it's OK for the return value to contain character references
	// because the caller is supposed to escape those anyway.
	if ( !$decodeRegex ) {
		$ws = '[\\x20\\t\\r\\n\\f]';
		$newline = '(?:\\n|\\r\\n|\\r|\\f)';
		$bs = '\\\\';
		$decodeRegex = "/ $bs
			(?:
				($newline) | # 1. Line continuation
				([0-9A-Fa-f]{1,6})$ws? | # 2. character number
				(.) | # 3. backslash cancelling special meaning
				() | # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback(
		$decodeRegex,
		[ __CLASS__, 'cssDecodeCallback' ],
		$value
	);

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if ( preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
		return $value;
	}

	// Remove any comments; IE gets token splitting wrong
	// This must be done AFTER decoding character references and
	// escape sequences, because those steps can introduce comments
	// This step cannot introduce character references or escape
	// sequences, because it replaces comments with spaces rather
	// than removing them completely.
	$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

	// Remove anything after a comment-start token, to guard against
	// incorrect client implementations.
	$commentPos = strpos( $value, '/*' );

	return $commentPos === false ? $value : substr( $value, 0, $commentPos );
}
870
/**
 * Normalize a CSS value and reject it if it contains something unsafe.
 *
 * The value is first passed through self::normalizeCss(). It is then
 * replaced by a CSS comment when it contains control characters or the
 * UTF-8 replacement character, or when it matches the list of
 * constructs below (expression, url(), image(), var() and so on).
 *
 * @param string $value
 * @return string The normalized value, or '/* invalid control char *' . '/'
 *   or '/* insecure input *' . '/' when rejected
 */
public static function checkCss( $value ) {
	$value = self::normalizeCss( $value );

	// Reject problematic keywords and control characters
	if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ||
		strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false ) {
		return '/* invalid control char */';
	} elseif ( preg_match(
		// Each "\(" must be a plain backslash followed by a parenthesis.
		// Any invisible character between the two turns "(" into an
		// unclosed group; the pattern then fails to compile and this
		// whole check is silently skipped.
		'! expression
			| filter\s*:
			| accelerator\s*:
			| -o-link\s*:
			| -o-link-source\s*:
			| -o-replace\s*:
			| url\s*\(
			| image\s*\(
			| image-set\s*\(
			| attr\s*\([^)]+[\s,]+url
			| var\s*\(
		!ix', $value ) ) {
		return '/* insecure input */';
	}
	return $value;
}
913
/**
 * preg_replace_callback handler that decodes one CSS backslash escape.
 *
 * Group 1 is a line continuation, group 2 a hexadecimal character
 * number, group 3 any other escaped character; with none of them set the
 * backslash stood at the end of the string.
 *
 * @param array $matches
 * @return string Replacement text for the escape
 */
private static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		// Line continuation
		return '';
	}

	if ( $matches[2] !== '' ) {
		$decoded = UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$decoded = $matches[3];
	} else {
		$decoded = '\\';
	}

	// These characters need to be escaped in strings
	$mustStayEscaped = $decoded == "\n" || $decoded == '"'
		|| $decoded == "'" || $decoded == '\\';
	if ( !$mustStayEscaped ) {
		// Decode unnecessary escape
		return $decoded;
	}

	// Clean up the escape sequence to avoid parsing errors by clients
	return '\\' . dechex( ord( $decoded ) ) . ' ';
}
938
/**
 * Turn the raw attribute text of a tag into a sanitized attribute string.
 *
 * The text is decoded into a name => value map, filtered for the given
 * element, optionally sorted by attribute name, and re-encoded.
 *
 * @param string $text Raw attribute text
 * @param string $element Tag name
 * @param bool $sorted Whether to sort the attributes by name
 * @return string Encoded attributes, or '' when $text is blank
 */
public static function fixTagAttributes( $text, $element, $sorted = false ) {
	if ( trim( $text ) == '' ) {
		return '';
	}

	$attribs = self::validateTagAttributes(
		self::decodeTagAttributes( $text ),
		$element
	);
	if ( $sorted ) {
		ksort( $attribs );
	}

	return self::safeEncodeTagAttributes( $attribs );
}
974
/**
 * Encode a string for use as an HTML attribute value.
 *
 * Besides the htmlspecialchars() escaping (quotes included), newline,
 * carriage return and tab are written as numeric character references.
 *
 * @param string $text
 * @return string
 */
public static function encodeAttribute( $text ) {
	// Whitespace is normalized during attribute decoding,
	// so if we've been passed non-spaces we must encode them
	// ahead of time or they won't be preserved.
	return str_replace(
		[ "\n", "\r", "\t" ],
		[ '&#10;', '&#13;', '&#9;' ],
		htmlspecialchars( $text, ENT_QUOTES )
	);
}
994
/**
 * Replace the plain space used with French punctuation by $space.
 *
 * Two cases are handled: a space before one of ? : ; ! % » › (when
 * something precedes the space and no word character follows the
 * punctuation), and a space after « or ‹.
 *
 * @param string $text
 * @param string $space Replacement for the space; inserted literally
 * @return string
 */
public static function armorFrenchSpaces( $text, $space = '&#160;' ) {
	// Replace $ with \$ and \ with \\ so that $space is inserted as-is
	// and never read as a backreference. The pattern must contain nothing
	// but backslashes, the dollar sign and grouping syntax: a stray
	// invisible character next to a backslash stops it from matching.
	$space = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );
	$fixtags = [
		# French spaces, last one Guillemet-left
		# only if there is something before the space
		# and a non-word character after the punctuation.
		'/(?<=\S) (?=[?:;!%»›](?!\w))/u' => "$space",
		# French spaces, Guillemet-right
		'/([«‹]) /u' => "\\1$space",
	];
	return preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
}
1016
/**
 * Encode an attribute value so that it is safe as HTML and is also left
 * alone by later wikitext parsing passes.
 *
 * @param string $text
 * @return string
 */
public static function safeEncodeAttribute( $text ) {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$wikitextArmor = [
		'<' => '&lt;', // This should never happen,
		'>' => '&gt;', // we've received invalid input
		'"' => '&quot;', // which should have been escaped.
		'{' => '&#123;',
		'}' => '&#125;', // prevent unpaired language conversion syntax
		'[' => '&#91;',
		']' => '&#93;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	];
	$encoded = strtr( self::encodeAttribute( $text ), $wikitextArmor );

	# Armor against French spaces detection (T5158)
	$encoded = self::armorFrenchSpaces( $encoded, '&#32;' );

	# Stupid hack: encode the colon of anything that looks like a URL protocol
	$protocolRegex = '/((?i)' . wfUrlProtocols() . ')/';
	$encodeColon = static function ( $matches ) {
		return str_replace( ':', '&#58;', $matches[1] );
	};

	return preg_replace_callback( $protocolRegex, $encodeColon, $encoded );
}
1056
/**
 * Escape a string into an HTML4-style id. Triggers a deprecation notice
 * (since 1.30).
 *
 * Spaces become underscores and the result is URL-encoded, with "%3A"
 * restored to ":" and every other "%" turned into ".". Unless the
 * 'noninitial' option is given, an "x" is prepended when the result does
 * not start with an ASCII letter.
 *
 * @param string $id
 * @param string|array $options 'noninitial' or a list containing it
 * @return string
 */
public static function escapeId( $id, $options = [] ) {
	wfDeprecated( __METHOD__, '1.30' );
	$options = (array)$options;

	// HTML4-style escaping
	static $replace = [
		'%3A' => ':',
		'%' => '.'
	];

	$escaped = strtr( urlencode( strtr( $id, ' ', '_' ) ), $replace );

	$startsWithLetter = preg_match( '/^[a-zA-Z]/', $escaped );
	if ( $startsWithLetter || in_array( 'noninitial', $options ) ) {
		return $escaped;
	}

	// Initial character must be a letter!
	return "x$escaped";
}
1104
/**
 * Escape a string for use as an HTML id attribute value, using the
 * fragment mode configured in $wgFragmentMode at index $mode.
 *
 * @param string $id
 * @param int $mode self::ID_PRIMARY or self::ID_FALLBACK
 * @return string|bool The escaped id, or false when no mode is configured
 *   at a non-primary index
 * @throws UnexpectedValueException When no primary mode is configured
 */
public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
	global $wgFragmentMode;

	if ( isset( $wgFragmentMode[$mode] ) ) {
		return self::escapeIdInternal( $id, $wgFragmentMode[$mode] );
	}

	if ( $mode === self::ID_PRIMARY ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return false;
}
1134
/**
 * Escape a string for use as the fragment of a link, using the primary
 * mode configured in $wgFragmentMode.
 *
 * @param string $id
 * @return string
 * @throws UnexpectedValueException When no primary mode is configured
 */
public static function escapeIdForLink( $id ) {
	global $wgFragmentMode;

	$primaryMode = $wgFragmentMode[self::ID_PRIMARY] ?? null;
	if ( $primaryMode === null ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return self::escapeIdInternalUrl( $id, $primaryMode );
}
1160
/**
 * Escape a string for use as the fragment of a link to an external
 * interwiki target, using the mode in $wgExternalInterwikiFragmentMode.
 *
 * @param string $id
 * @return string
 */
public static function escapeIdForExternalInterwiki( $id ) {
	// Without this declaration the variable below would be an undefined
	// local, and escapeIdInternal() would reject the empty mode.
	global $wgExternalInterwikiFragmentMode;

	$id = self::escapeIdInternalUrl( $id, $wgExternalInterwikiFragmentMode );

	return $id;
}
1177
/**
 * Escape an id for use in a URL fragment.
 *
 * Same as self::escapeIdInternal(), except that in 'html5' mode a "%"
 * followed by two hexadecimal digits has its "%" rewritten as "%25".
 *
 * @param string $id
 * @param string $mode 'html5' or 'legacy'
 * @return string
 */
private static function escapeIdInternalUrl( $id, $mode ) {
	$escaped = self::escapeIdInternal( $id, $mode );
	if ( $mode !== 'html5' ) {
		return $escaped;
	}

	return preg_replace( '/%([a-fA-F0-9]{2})/', '%25$1', $escaped );
}
1194
/**
 * Escape an id according to the given fragment mode.
 *
 * @param string $id
 * @param string $mode 'html5' or 'legacy'
 * @return string
 * @throws InvalidArgumentException For any other mode
 */
private static function escapeIdInternal( $id, $mode ) {
	// Truncate overly-long IDs. This isn't an HTML limit, it's just
	// griefer protection. [T251506]
	$id = mb_substr( $id, 0, 1024 );

	// Loose comparison on purpose: it matches how a switch compares.
	if ( $mode == 'html5' ) {
		// html5 spec says ids must not have any of the following:
		// U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE
		// In practice, in wikitext, only tab, LF, CR (and SPACE) are
		// possible using either Lua or html entities.
		return str_replace( [ "\t", "\n", "\f", "\r", " " ], '_', $id );
	}

	if ( $mode == 'legacy' ) {
		// This corresponds to 'noninitial' mode of the old escapeId()
		$legacyMap = [
			'%3A' => ':',
			'%' => '.'
		];

		return strtr( urlencode( str_replace( ' ', '_', $id ) ), $legacyMap );
	}

	throw new InvalidArgumentException( "Invalid mode '$mode' passed to '" . __METHOD__ );
}
1231
/**
 * Escape every id in a whitespace-separated list of ids.
 *
 * @param string $referenceString Whitespace-separated ids
 * @return string The escaped ids joined by single spaces; '' when the
 *   list is empty
 */
public static function escapeIdReferenceList( $referenceString ) {
	# Explode the space delimited list string into an array of tokens
	$tokens = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );

	# Escape each token as an id
	$escapedTokens = array_map(
		static function ( $token ) {
			return self::escapeIdForAttribute( $token );
		},
		$tokens
	);

	# Merge the array back to a space delimited list string
	return implode( ' ', $escapedTokens );
}
1256
/**
 * Turn a string into something usable as a CSS class name.
 *
 * A leading digit or hyphen, ASCII control characters and space, most
 * ASCII punctuation, and the bytes C2 A0 each become an underscore.
 * Runs of underscores are then collapsed and trailing ones removed.
 *
 * @param string $class
 * @return string
 */
public static function escapeClass( $class ) {
	// Convert ugly stuff to underscores...
	$unsafeChars = '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/';
	$escaped = preg_replace( $unsafeChars, '_', $class );

	// ...and kill underscores in ugly places
	$escaped = preg_replace( '/_+/', '_', $escaped );

	return rtrim( $escaped, '_' );
}
1275
/**
 * HTML-escape a string whose character references count as the
 * characters they stand for.
 *
 * References are decoded first, so that they are not double-escaped,
 * and the result is then escaped as a whole.
 *
 * @param string $html
 * @return string
 */
public static function escapeHtmlAllowEntities( $html ) {
	$decoded = self::decodeCharReferences( $html );

	# It seems wise to escape ' as well as ", as a matter of course. Can't
	# hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
	# don't cause the entire string to disappear.
	return htmlspecialchars( $decoded, ENT_QUOTES | ENT_SUBSTITUTE );
}
1291
/**
 * Parse the raw attribute text of a tag into a name => value map.
 *
 * Names are lower-cased; names with unacceptable characters are skipped.
 * In each value, runs of whitespace are collapsed to one space, the ends
 * are trimmed and character references are decoded. When a name occurs
 * more than once, the last occurrence wins.
 *
 * @param string $text
 * @return array Attribute name => decoded value
 */
public static function decodeTagAttributes( $text ) {
	if ( trim( $text ) == '' ) {
		return [];
	}

	$matchSets = [];
	$found = preg_match_all( self::getAttribsRegex(), $text, $matchSets, PREG_SET_ORDER );
	if ( !$found ) {
		return [];
	}

	$decoded = [];
	foreach ( $matchSets as $matchSet ) {
		$name = strtolower( $matchSet[1] );

		// Filter attribute names with unacceptable characters
		if ( !preg_match( self::getAttribNameRegex(), $name ) ) {
			continue;
		}

		// Normalize whitespace
		$rawValue = self::getTagAttributeCallback( $matchSet );
		$normalized = trim( preg_replace( '/[\t\r\n ]+/', ' ', $rawValue ) );

		// Decode character references
		$decoded[$name] = self::decodeCharReferences( $normalized );
	}

	return $decoded;
}
1334
/**
 * Build an attribute string from a name => value map.
 *
 * Names go through htmlspecialchars() and values through
 * self::safeEncodeAttribute().
 *
 * @param array $assoc_array Attribute name => value
 * @return string Attributes as name="value" pairs, each preceded by a
 *   space; '' when there are none
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
	$pairs = [];
	foreach ( $assoc_array as $name => $value ) {
		$pairs[] = htmlspecialchars( $name ) . '="' . self::safeEncodeAttribute( $value ) . '"';
	}

	if ( !$pairs ) {
		return '';
	}

	return ' ' . implode( ' ', $pairs );
}
1352
/**
 * Pick the attribute value out of one match set of the attribute regex.
 *
 * @param array $set Match set: index 5 holds an unquoted value, 4 a
 *   single-quoted one, 3 a double-quoted one, 2 the whole "= value" part
 * @return string The value, or "" when the attribute has no value part
 * @throws MWException When a value part is present but no value group is
 */
private static function getTagAttributeCallback( $set ) {
	// Checked in this order: no quotes, single-quoted, double-quoted
	foreach ( [ 5, 4, 3 ] as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( !isset( $set[2] ) ) {
		# In XHTML, attributes must have a value so return an empty string.
		# See "Empty attribute syntax",
		# https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
		return "";
	}

	throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
}
1380
/**
 * Collapse each run of spaces, tabs, carriage returns and line feeds
 * into a single space, and trim the result.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
	$collapsed = preg_replace( '/(?:\r\n|[\x20\x0d\x0a\x09])+/', ' ', $text );

	return trim( $collapsed );
}
1391
/**
 * Collapse each run of spaces and underscores in a section name into a
 * single space, and trim the result.
 *
 * @param string $section
 * @return string
 */
public static function normalizeSectionNameWhitespace( $section ) {
	$collapsed = preg_replace( '/[ _]+/', ' ', $section );

	return trim( $collapsed );
}
1403
/**
 * Normalize every character reference (and every lone ampersand) in
 * $text by passing each match of self::CHAR_REFS_REGEX through
 * self::normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function normalizeCharReferences( $text ) {
	$callback = [ self::class, 'normalizeCharReferencesCallback' ];

	return preg_replace_callback( self::CHAR_REFS_REGEX, $callback, $text );
}
1425
/**
 * preg_replace_callback handler for self::normalizeCharReferences().
 *
 * Group 1 is a named entity, group 2 a decimal reference and group 3 a
 * hexadecimal one. When none applies, or the reference is invalid, the
 * matched text is returned HTML-escaped.
 *
 * @param array $matches
 * @return string
 */
private static function normalizeCharReferencesCallback( $matches ) {
	$normalized = null;
	if ( $matches[1] != '' ) {
		$normalized = self::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		$normalized = self::decCharReference( $matches[2] );
	} elseif ( $matches[3] != '' ) {
		$normalized = self::hexCharReference( $matches[3] );
	}

	return $normalized ?? htmlspecialchars( $matches[0] );
}
1445
/**
 * Normalize a named entity.
 *
 * An alias is replaced by the entity it stands for; lt, gt, amp and quot
 * are kept as named entities; any other known entity becomes a decimal
 * reference; an unknown name gets its ampersand escaped.
 *
 * @param string $name Entity name without "&" and ";"
 * @return string
 */
private static function normalizeEntity( $name ) {
	if ( isset( self::HTML_ENTITY_ALIASES[$name] ) ) {
		return '&' . self::HTML_ENTITY_ALIASES[$name] . ';';
	}

	if ( in_array( $name, [ 'lt', 'gt', 'amp', 'quot' ] ) ) {
		return "&$name;";
	}

	if ( isset( self::HTML_ENTITIES[$name] ) ) {
		return '&#' . self::HTML_ENTITIES[$name] . ';';
	}

	return "&amp;$name;";
}
1467
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint Digits of the reference
 * @return string|null The reference as "&#N;", or null when the code
 *   point fails self::validateCodepoint()
 */
private static function decCharReference( $codepoint ) {
	$point = intval( $codepoint );

	return self::validateCodepoint( $point ) ? sprintf( '&#%d;', $point ) : null;
}
1480
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint Hexadecimal digits of the reference
 * @return string|null Canonical lower-case "&#xN;" form, or null when the
 *  codepoint is rejected by validateCodepoint()
 */
private static function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );

	return self::validateCodepoint( $point )
		? sprintf( '&#x%x;', $point )
		: null;
}
1493
/**
 * Returns true if a given Unicode codepoint is a valid character in
 * both HTML5 and XML.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	# U+007F - U+009F are disallowed in HTML5 (control characters).

	// Tab and line feed are the only accepted characters below U+0020.
	if ( $codepoint == 0x09 || $codepoint == 0x0a ) {
		return true;
	}

	// Inclusive ranges; the gaps exclude the C1 controls, the surrogate
	// block and the U+FFFE / U+FFFF noncharacters.
	$acceptedRanges = [
		[ 0x20, 0x7e ],
		[ 0xa0, 0xd7ff ],
		[ 0xe000, 0xfffd ],
		[ 0x10000, 0x10ffff ],
	];
	foreach ( $acceptedRanges as [ $low, $high ] ) {
		if ( $codepoint >= $low && $codepoint <= $high ) {
			return true;
		}
	}

	return false;
}
1511
/**
 * Decode any character references, numeric or named entities, in the
 * text and return a UTF-8 string.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
	$callback = [ self::class, 'decodeCharReferencesCallback' ];

	return preg_replace_callback( self::CHAR_REFS_REGEX, $callback, $text );
}
1525
/**
 * Decode any character references, numeric or named entities, in the
 * text and normalize the resulting string with the content language.
 *
 * Normalization is skipped when CHAR_REFS_REGEX did not match at all,
 * in which case the text is returned as-is.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
	$replacements = 0;
	$decoded = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		[ self::class, 'decodeCharReferencesCallback' ],
		$text,
		-1, // limit
		$replacements
	);

	if ( !$replacements ) {
		return $decoded;
	}

	return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $decoded );
}
1551
/**
 * preg_replace_callback() handler for decodeCharReferences() and
 * decodeCharReferencesAndNormalize().
 *
 * @param array $matches Group 1: named entity, group 2: decimal
 *  codepoint, group 3: hexadecimal codepoint
 * @return string
 */
private static function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		return self::decodeEntity( $matches[1] );
	}

	if ( $matches[2] != '' ) {
		$codepoint = intval( $matches[2] );
		return self::decodeChar( $codepoint );
	}

	if ( $matches[3] != '' ) {
		$codepoint = hexdec( $matches[3] );
		return self::decodeChar( $codepoint );
	}

	# Last case should be an ampersand by itself
	return $matches[0];
}
1567
/**
 * Return the UTF-8 string for a codepoint if that is a valid character
 * reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 *
 * @param int $codepoint
 * @return string
 */
private static function decodeChar( $codepoint ) {
	return self::validateCodepoint( $codepoint )
		? UtfNormal\Utils::codepointToUtf8( $codepoint )
		: UtfNormal\Constants::UTF8_REPLACEMENT;
}
1582
/**
 * If the named entity (after alias resolution) is listed in
 * HTML_ENTITIES, return the UTF-8 encoding of that character.
 * Otherwise return the entity text, using the resolved name.
 *
 * @param string $name Entity name without the leading '&' and trailing ';'
 * @return string
 */
private static function decodeEntity( $name ) {
	// Resolve MediaWiki aliases to the entity they stand for first.
	$canonical = self::HTML_ENTITY_ALIASES[$name] ?? $name;

	if ( !isset( self::HTML_ENTITIES[$canonical] ) ) {
		return '&' . $canonical . ';';
	}

	return UtfNormal\Utils::codepointToUtf8( self::HTML_ENTITIES[$canonical] );
}
1601
/**
 * Fetch the list of acceptable attributes for a given element name.
 *
 * @param string $element
 * @return array Attribute names as keys; empty for an unknown element
 */
private static function attributesAllowedInternal( $element ) {
	$byElement = self::setupAttributesAllowedInternal();

	if ( isset( $byElement[$element] ) ) {
		return $byElement[$element];
	}

	return [];
}
1613
/**
 * For each array key (an allowed HTML element), return an array of
 * allowed attributes. The attribute names are the array KEYS, so that
 * callers can test membership with isset().
 *
 * The table is built on the first call and memoised in a function-static
 * variable for later calls.
 *
 * @return array[]
 */
private static function setupAttributesAllowedInternal() {
	static $allowedByElement;

	if ( $allowedByElement !== null ) {
		return $allowedByElement;
	}

	// Extend an already-flipped attribute map with one or two plain lists
	// of attribute names; the lists are flipped so names become keys.
	$extend = static function ( $base, $extra, $more = [] ) {
		return array_merge( $base, array_flip( $extra ), array_flip( $more ) );
	};

	// Attributes accepted on (nearly) every element.
	$common = $extend( [], [
		// HTML
		'id',
		'class',
		'style',
		'lang',
		'dir',
		'title',
		'tabindex',

		// WAI-ARIA
		'aria-describedby',
		'aria-flowto',
		'aria-hidden',
		'aria-label',
		'aria-labelledby',
		'aria-owns',
		'role',

		// RDFa
		// These attributes are specified in section 9 of
		// https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		'about',
		'property',
		'resource',
		'datatype',
		'typeof',

		// Microdata. These are specified by
		// https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
		'itemid',
		'itemprop',
		'itemref',
		'itemscope',
		'itemtype',
	] );

	$block = $extend( $common, [ 'align' ] );

	$tablealign = [ 'align', 'valign' ];
	$tablecell = [
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', // deprecated
		'width', // deprecated
		'height', // deprecated
		'bgcolor', // deprecated
	];

	// Attribute sets shared by more than one element.
	$quotation = $extend( $common, [ 'cite' ] );
	$revision = $extend( $common, [ 'cite', 'datetime' ] );
	$columns = $extend( $common, [ 'span' ] );
	$cell = $extend( $common, $tablecell, $tablealign );
	$sized = $extend( $common, [ 'width' ] );

	// Numbers refer to sections in HTML 4.01 standard describing the element.
	// See: https://www.w3.org/TR/html4/
	$allowedByElement = [
		// 7.5.4
		'div' => $block,
		'center' => $common, // deprecated
		'span' => $common,

		// 7.5.5
		'h1' => $block,
		'h2' => $block,
		'h3' => $block,
		'h4' => $block,
		'h5' => $block,
		'h6' => $block,

		// 7.5.6
		// address

		// 8.2.4
		'bdo' => $common,

		// 9.2.1
		'em' => $common,
		'strong' => $common,
		'cite' => $common,
		'dfn' => $common,
		'code' => $common,
		'samp' => $common,
		'kbd' => $common,
		'var' => $common,
		'abbr' => $common,
		// acronym

		// 9.2.2
		'blockquote' => $quotation,
		'q' => $quotation,

		// 9.2.3
		'sub' => $common,
		'sup' => $common,

		// 9.3.1
		'p' => $block,

		// 9.3.2
		'br' => $extend( $common, [ 'clear' ] ),

		// https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
		'wbr' => $common,

		// 9.3.4
		'pre' => $sized,

		// 9.4
		'ins' => $revision,
		'del' => $revision,

		// 10.2
		'ul' => $extend( $common, [ 'type' ] ),
		'ol' => $extend( $common, [ 'type', 'start', 'reversed' ] ),
		'li' => $extend( $common, [ 'type', 'value' ] ),

		// 10.3
		'dl' => $common,
		'dd' => $common,
		'dt' => $common,

		// 11.2.1
		'table' => $extend( $common, [
			'summary', 'width', 'border', 'frame',
			'rules', 'cellspacing', 'cellpadding',
			'align', 'bgcolor',
		] ),

		// 11.2.2
		'caption' => $block,

		// 11.2.3
		'thead' => $common,
		'tfoot' => $common,
		'tbody' => $common,

		// 11.2.4
		'colgroup' => $columns,
		'col' => $columns,

		// 11.2.5
		'tr' => $extend( $common, [ 'bgcolor' ], $tablealign ),

		// 11.2.6
		'td' => $cell,
		'th' => $cell,

		// 12.2
		// NOTE: <a> is not allowed directly, but this list of allowed
		// attributes is used from the Parser object
		'a' => $extend( $common, [ 'href', 'rel', 'rev' ] ), // rel/rev esp. for RDFa

		// 13.2
		// Not usually allowed, but may be used for extension-style hooks
		// such as <math> when it is rasterized, or if $wgAllowImageTag is
		// true
		'img' => $extend( $common, [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
		// Attributes for A/V tags added in T163583 / T133673
		'audio' => $extend( $common, [ 'controls', 'preload', 'width', 'height' ] ),
		'video' => $extend( $common, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
		'source' => $extend( $common, [ 'type', 'src' ] ),
		'track' => $extend( $common, [ 'type', 'src', 'srclang', 'kind', 'label' ] ),

		// 15.2.1
		'tt' => $common,
		'b' => $common,
		'i' => $common,
		'big' => $common,
		'small' => $common,
		'strike' => $common,
		's' => $common,
		'u' => $common,

		// 15.2.2
		'font' => $extend( $common, [ 'size', 'color', 'face' ] ),
		// basefont

		// 15.3
		'hr' => $sized,

		// HTML Ruby annotation text module, simple ruby only.
		// https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
		'ruby' => $common,
		// rbc
		'rb' => $common,
		'rp' => $common,
		'rt' => $common, // $extend( $common, [ 'rbspan' ] ),
		'rtc' => $common,

		// MathML root element, where used for extensions
		// 'title' may not be 100% valid here; it's XHTML
		// https://www.w3.org/TR/REC-MathML/
		'math' => $extend( [], [ 'class', 'style', 'id', 'title' ] ),

		// HTML 5 section 4.5
		'figure' => $common,
		'figure-inline' => $common, // T118520
		'figcaption' => $common,

		// HTML 5 section 4.6
		'bdi' => $common,

		// HTML5 elements, defined by:
		// https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
		'data' => $extend( $common, [ 'value' ] ),
		'time' => $extend( $common, [ 'datetime' ] ),
		'mark' => $common,

		// meta and link are only permitted by removeHTMLtags when Microdata
		// is enabled so we don't bother adding a conditional to hide these
		// Also meta and link are only valid in WikiText as Microdata elements
		// (ie: validateTag rejects tags missing the attributes needed for Microdata)
		// So we don't bother including $common attributes that have no purpose.
		'meta' => $extend( [], [ 'itemprop', 'content' ] ),
		'link' => $extend( [], [ 'itemprop', 'href', 'title' ] ),
	];

	return $allowedByElement;
}
1852
/**
 * Take a fragment of (potentially invalid) HTML and return a version
 * with any tags removed and whitespace normalized.
 *
 * The fragment is tokenized with RemexHtml; the text collected by
 * RemexStripTagHandler is then passed through normalizeWhitespace().
 *
 * @param string $html
 * @return string
 */
public static function stripAllTags( $html ) {
	// Use RemexHtml to tokenize $html and extract the text
	$tokenizerOptions = [
		'ignoreErrors' => true,
		// don't ignore char refs, we want them to be decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	];
	$handler = new RemexStripTagHandler;
	$tokenizer = new RemexHtml\Tokenizer\Tokenizer( $handler, $html, $tokenizerOptions );
	$tokenizer->execute();

	return self::normalizeWhitespace( $handler->getResult() );
}
1879
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations:
 * one <!ENTITY> declaration per HTML_ENTITIES entry, mapping the name
 * to its decimal character reference.
 *
 * @return string
 */
public static function hackDocType() {
	$declarations = [];
	foreach ( self::HTML_ENTITIES as $entity => $codepoint ) {
		$declarations[] = "<!ENTITY $entity \"&#$codepoint;\">";
	}

	return "<!DOCTYPE html [\n" . implode( '', $declarations ) . "]>\n";
}
1897
/**
 * Clean up a URL: decode character references, percent-encode control
 * and markup-significant characters, strip characters that are ignored
 * in IDNs from the host part, and un-escape bracketed IPv6 literals.
 *
 * @param string $url
 * @return string
 */
public static function cleanUrl( $url ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = self::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step
	$url = preg_replace_callback(
		'/[\][<>"\\x00-\\x20\\x7F\|]/',
		[ __CLASS__, 'cleanUrlCallback' ],
		$url
	);

	# Validate hostname portion
	$parts = [];
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $parts ) ) {
		// Nothing that looks like "protocol:"; leave the string alone.
		return $url;
	}
	[ /* $whole */, $protocol, $host, $rest ] = $parts;

	// Characters that will be ignored in IDNs.
	// https://tools.ietf.org/html/rfc3454#section-3.1
	// Strip them before further processing so deny lists and such work.
	$strip = "/
		\\s|          # general whitespace
		\xc2\xad|     # 00ad SOFT HYPHEN
		\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
		\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
		\xe2\x81\xa0| # 2060 WORD JOINER
		\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
		\xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
		\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
		\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
		\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
		\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
		\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
		[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
		/xuD";

	$host = preg_replace( $strip, '', $host );

	// IPv6 host names are bracketed with []. Url-decode these.
	// The cheap prefix comparison runs first so the regex is only tried
	// on hosts that start with an encoded '['.
	$ipv6 = [];
	if ( substr_compare( "//%5B", $host, 0, 5 ) === 0 &&
		preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $ipv6 )
	) {
		$host = '//[' . $ipv6[1] . ']' . $ipv6[2];
	}

	// @todo FIXME: Validate hostnames here

	return $protocol . $host . $rest;
}
1951
/**
 * preg_replace_callback() handler for cleanUrl(): percent-encode the
 * matched character with urlencode().
 *
 * @param array $matches
 * @return string
 */
private static function cleanUrlCallback( $matches ) {
	$matched = $matches[0];

	return urlencode( $matched );
}
1959
/**
 * Does a string look like an e-mail address?
 *
 * The IsValidEmailAddr hook runs first; when a handler aborts it, the
 * handler-provided $result is returned as-is (it stays null if the
 * handler did not set it). Otherwise the address is checked against a
 * liberal "user@domain" pattern: one or more RFC 5322 atext characters
 * or dots, an at sign, then one or more dot-separated labels made of
 * letters, digits and hyphens.
 *
 * The pattern uses the D (dollar-end-only) modifier. Without it, "$"
 * also matches just before a trailing newline, so "user@example.org\n"
 * would be accepted as valid.
 *
 * @param string $addr E-mail address
 * @return bool|null Whether the address looks valid, or whatever the
 *  hook handler stored in $result
 */
public static function validateEmail( $addr ) {
	$result = null;
	if ( !Hooks::runner()->onIsValidEmailAddr( $addr, $result ) ) {
		return $result;
	}

	// Please note strings below are enclosed in brackets [], this make the
	// hyphen "-" a range indicator. Hence it is double backslashed below.
	// See T28948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	$html5_email_regexp = "/
	^                      # start of string
	[$rfc5322_atext\\.]+    # user part which is liberal :p
	@                      # 'apostrophe'
	[$rfc1034_ldh_str]+       # First domain part
	(\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
	$                      # End of string
	/ixD"; // case Insensitive, eXtended, Dollar matches only at the very end

	return (bool)preg_match( $html5_email_regexp, $addr );
}
2011}
$wgAllowImageTag
A different approach to the above: simply allow the "<img>" tag to be used.
$wgFragmentMode
How should section IDs be encoded? This array can contain 1 or 2 elements, each of them can be one of...
$wgExternalInterwikiFragmentMode
Which ID escaping mode should be used for external interwiki links? See documentation for $wgFragment...
wfDeprecatedMsg( $msg, $version=false, $component=false, $callerOffset=2)
Log a deprecation warning with arbitrary message text.
wfUrlProtocols( $includeProtocolRelative=true)
Returns a regular expression of url protocols.
wfDeprecated( $function, $version=false, $component=false, $callerOffset=2)
Logs a warning that $function is deprecated.
MediaWiki exception.
MediaWikiServices is the service locator for the application scope of MediaWiki.
HTML sanitizer for MediaWiki.
Definition Sanitizer.php:33
static fixTagAttributes( $text, $element, $sorted=false)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML,...
const CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferenc...
Definition Sanitizer.php:38
static cleanUrl( $url)
const HTML_ENTITY_ALIASES
Character entity aliases accepted by MediaWiki.
static decCharReference( $codepoint)
static isReservedDataAttribute( $attr)
Given an attribute name, checks whether it is a reserved data attribute (such as data-mw-foo) which i...
static escapeIdInternalUrl( $id, $mode)
Do percent encoding of percent signs for href (but not id) attributes.
static decodeChar( $codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEM...
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
static getAttribNameRegex()
Used in Sanitizer::decodeTagAttributes to filter attributes.
static checkCss( $value)
Pick apart some CSS and check it for forbidden or unsafe structures.
static decodeEntity( $name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that chara...
static normalizeEntity( $name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the equivalent numeric entity re...
static validateAttributes( $attribs, $allowed)
Take an array of attribute names and values and normalize or discard illegal values.
static armorFrenchSpaces( $text, $space='&#160;')
Armor French spaces with a replacement character.
static setupAttributesAllowedInternal()
Foreach array key (an allowed HTML element), return an array of allowed attributes.
static getRecognizedTagData( $extratags=[], $removetags=[])
Return the various lists of recognized tags.
static $attribsRegex
Lazy-initialised attributes regex, see getAttribsRegex()
static escapeIdForLink( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
static removeHTMLcomments( $text)
Remove '<!--', '-->', and everything between.
static encodeAttribute( $text)
Encode an attribute value for HTML output.
static hexCharReference( $codepoint)
static validateTagAttributes( $attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
static escapeClass( $class)
Given a value, escape it so that it can be used as a CSS class and return it.
const EVIL_URI_PATTERN
Pattern matching evil uris like javascript: WARNING: DO NOT use this in any place that actually requi...
Definition Sanitizer.php:59
static normalizeSectionNameWhitespace( $section)
Normalizes whitespace in a section name, such as might be returned by Parser::stripSectionName(),...
static normalizeCharReferencesCallback( $matches)
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
static cleanUrlCallback( $matches)
static normalizeCharReferences( $text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
static removeHTMLtags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments.
static cssDecodeCallback( $matches)
static escapeIdReferenceList( $referenceString)
Given a string containing a space delimited list of ids, escape each id to match ids escaped by the e...
static normalizeWhitespace( $text)
static getAttribsRegex()
Regular expression to match HTML/XML attribute pairs within a tag.
static validateCodepoint( $codepoint)
Returns true if a given Unicode codepoint is a valid character in both HTML5 and XML.
const ID_FALLBACK
Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false if no fallback...
Definition Sanitizer.php:75
static decodeCharReferences( $text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
static escapeIdForAttribute( $id, $mode=self::ID_PRIMARY)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid HTM...
const HTML_ENTITIES
List of all named character entities defined in HTML 4.01 https://www.w3.org/TR/html4/sgml/entities....
Definition Sanitizer.php:82
static $attribNameRegex
Lazy-initialised attribute name regex, see getAttribNameRegex()
static getTagAttributeCallback( $set)
Pick the appropriate attribute value from a match set from the attribs regex matches.
static escapeIdInternal( $id, $mode)
Helper for escapeIdFor*() functions.
static validateTag( $params, $element)
Takes attribute names and values for a tag and the tag name and validates that the tag is allowed to ...
static stripAllTags( $html)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed,...
static attributesAllowedInternal( $element)
Fetch the list of acceptable attributes for a given element name.
static decodeCharReferencesAndNormalize( $text)
Decode any character references, numeric or named entities, in the text and normalize the resulting s...
static escapeId( $id, $options=[])
Given a value, escape it so that it can be used in an id attribute and return it.
static decodeCharReferencesCallback( $matches)
static mergeAttributes( $a, $b)
Merge two sets of HTML attributes.
static escapeIdForExternalInterwiki( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
static validateEmail( $addr)
Does a string look like an e-mail address?
const ID_PRIMARY
Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding.
Definition Sanitizer.php:67
const ELEMENT_BITS_REGEX
Acceptable tag name charset from HTML5 parsing spec https://www.w3.org/TR/html5/syntax....
Definition Sanitizer.php:48
static safeEncodeAttribute( $text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing.
const XMLNS_ATTRIBUTE_PATTERN
Definition Sanitizer.php:60
static normalizeCss( $value)
Normalize CSS into a format we can easily search for hostile input.
static delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags='')
Perform an operation equivalent to preg_replace() with flags.
if( $line===false) $args
Definition mcc.php:124