MediaWiki REL1_30
Sanitizer.php
Go to the documentation of this file.
1<?php
31class Sanitizer {
/**
 * Matches the character-reference forms recognised by the sanitizer.
 * Written in extended (/x) syntax with four alternative capture groups:
 *   1. a named reference          (&name;)
 *   2. a decimal reference        (&#123;)
 *   3. a hexadecimal reference    (&#x7b; or &#X7B;)
 *   4. a bare ampersand
 *
 * NOTE(review): the declaration line for this literal was missing from the
 * listing; the constant name is restored from upstream MediaWiki -- confirm.
 */
const CHAR_REFS_REGEX =
	'/&([A-Za-z0-9\x80-\xff]+);
	 |&\#([0-9]+);
	 |&\#[xX]([0-9A-Fa-f]+);
	 |(&)/x';

/**
 * Splits one "<"-delimited chunk of text into five groups: an optional
 * leading slash, the element name, the raw attribute text, the closing
 * ">" or "/>", and the trailing text up to the next "<".
 * Used by removeHTMLtags().
 */
const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';

/**
 * Case-insensitive match for "javascript" or "vbscript" appearing at the
 * start of the value, after whitespace, or after a closing CSS comment
 * delimiter, and followed by a non-word character or the end of the value.
 * Used by validateAttributes() to drop suspicious attribute values.
 */
const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';

/** Matches attribute names of the form "xmlns:<suffix>". */
const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";

/** Index of the primary entry in $wgFragmentMode (see escapeIdForAttribute()). */
const ID_PRIMARY = 0;

/** Index of the fallback entry in $wgFragmentMode (see escapeIdForAttribute()). */
const ID_FALLBACK = 1;
73
/**
 * Map of named character entities to their Unicode code points
 * (entity name without "&" and ";" => decimal code point).
 * Entity names are case-sensitive: e.g. 'Aacute' and 'aacute' differ.
 */
private static $htmlEntities = [
	'Aacute' => 193,
	'aacute' => 225,
	'Acirc' => 194,
	'acirc' => 226,
	'acute' => 180,
	'AElig' => 198,
	'aelig' => 230,
	'Agrave' => 192,
	'agrave' => 224,
	'alefsym' => 8501,
	'Alpha' => 913,
	'alpha' => 945,
	'amp' => 38,
	'and' => 8743,
	'ang' => 8736,
	'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
	'Aring' => 197,
	'aring' => 229,
	'asymp' => 8776,
	'Atilde' => 195,
	'atilde' => 227,
	'Auml' => 196,
	'auml' => 228,
	'bdquo' => 8222,
	'Beta' => 914,
	'beta' => 946,
	'brvbar' => 166,
	'bull' => 8226,
	'cap' => 8745,
	'Ccedil' => 199,
	'ccedil' => 231,
	'cedil' => 184,
	'cent' => 162,
	'Chi' => 935,
	'chi' => 967,
	'circ' => 710,
	'clubs' => 9827,
	'cong' => 8773,
	'copy' => 169,
	'crarr' => 8629,
	'cup' => 8746,
	'curren' => 164,
	'dagger' => 8224,
	'Dagger' => 8225,
	'darr' => 8595,
	'dArr' => 8659,
	'deg' => 176,
	'Delta' => 916,
	'delta' => 948,
	'diams' => 9830,
	'divide' => 247,
	'Eacute' => 201,
	'eacute' => 233,
	'Ecirc' => 202,
	'ecirc' => 234,
	'Egrave' => 200,
	'egrave' => 232,
	'empty' => 8709,
	'emsp' => 8195,
	'ensp' => 8194,
	'Epsilon' => 917,
	'epsilon' => 949,
	'equiv' => 8801,
	'Eta' => 919,
	'eta' => 951,
	'ETH' => 208,
	'eth' => 240,
	'Euml' => 203,
	'euml' => 235,
	'euro' => 8364,
	'exist' => 8707,
	'fnof' => 402,
	'forall' => 8704,
	'frac12' => 189,
	'frac14' => 188,
	'frac34' => 190,
	'frasl' => 8260,
	'Gamma' => 915,
	'gamma' => 947,
	'ge' => 8805,
	'gt' => 62,
	'harr' => 8596,
	'hArr' => 8660,
	'hearts' => 9829,
	'hellip' => 8230,
	'Iacute' => 205,
	'iacute' => 237,
	'Icirc' => 206,
	'icirc' => 238,
	'iexcl' => 161,
	'Igrave' => 204,
	'igrave' => 236,
	'image' => 8465,
	'infin' => 8734,
	'int' => 8747,
	'Iota' => 921,
	'iota' => 953,
	'iquest' => 191,
	'isin' => 8712,
	'Iuml' => 207,
	'iuml' => 239,
	'Kappa' => 922,
	'kappa' => 954,
	'Lambda' => 923,
	'lambda' => 955,
	'lang' => 9001,
	'laquo' => 171,
	'larr' => 8592,
	'lArr' => 8656,
	'lceil' => 8968,
	'ldquo' => 8220,
	'le' => 8804,
	'lfloor' => 8970,
	'lowast' => 8727,
	'loz' => 9674,
	'lrm' => 8206,
	'lsaquo' => 8249,
	'lsquo' => 8216,
	'lt' => 60,
	'macr' => 175,
	'mdash' => 8212,
	'micro' => 181,
	'middot' => 183,
	'minus' => 8722,
	'Mu' => 924,
	'mu' => 956,
	'nabla' => 8711,
	'nbsp' => 160,
	'ndash' => 8211,
	'ne' => 8800,
	'ni' => 8715,
	'not' => 172,
	'notin' => 8713,
	'nsub' => 8836,
	'Ntilde' => 209,
	'ntilde' => 241,
	'Nu' => 925,
	'nu' => 957,
	'Oacute' => 211,
	'oacute' => 243,
	'Ocirc' => 212,
	'ocirc' => 244,
	'OElig' => 338,
	'oelig' => 339,
	'Ograve' => 210,
	'ograve' => 242,
	'oline' => 8254,
	'Omega' => 937,
	'omega' => 969,
	'Omicron' => 927,
	'omicron' => 959,
	'oplus' => 8853,
	'or' => 8744,
	'ordf' => 170,
	'ordm' => 186,
	'Oslash' => 216,
	'oslash' => 248,
	'Otilde' => 213,
	'otilde' => 245,
	'otimes' => 8855,
	'Ouml' => 214,
	'ouml' => 246,
	'para' => 182,
	'part' => 8706,
	'permil' => 8240,
	'perp' => 8869,
	'Phi' => 934,
	'phi' => 966,
	'Pi' => 928,
	'pi' => 960,
	'piv' => 982,
	'plusmn' => 177,
	'pound' => 163,
	'prime' => 8242,
	'Prime' => 8243,
	'prod' => 8719,
	'prop' => 8733,
	'Psi' => 936,
	'psi' => 968,
	'quot' => 34,
	'radic' => 8730,
	'rang' => 9002,
	'raquo' => 187,
	'rarr' => 8594,
	'rArr' => 8658,
	'rceil' => 8969,
	'rdquo' => 8221,
	'real' => 8476,
	'reg' => 174,
	'rfloor' => 8971,
	'Rho' => 929,
	'rho' => 961,
	'rlm' => 8207,
	'rsaquo' => 8250,
	'rsquo' => 8217,
	'sbquo' => 8218,
	'Scaron' => 352,
	'scaron' => 353,
	'sdot' => 8901,
	'sect' => 167,
	'shy' => 173,
	'Sigma' => 931,
	'sigma' => 963,
	'sigmaf' => 962,
	'sim' => 8764,
	'spades' => 9824,
	'sub' => 8834,
	'sube' => 8838,
	'sum' => 8721,
	'sup' => 8835,
	'sup1' => 185,
	'sup2' => 178,
	'sup3' => 179,
	'supe' => 8839,
	'szlig' => 223,
	'Tau' => 932,
	'tau' => 964,
	'there4' => 8756,
	'Theta' => 920,
	'theta' => 952,
	'thetasym' => 977,
	'thinsp' => 8201,
	'THORN' => 222,
	'thorn' => 254,
	'tilde' => 732,
	'times' => 215,
	'trade' => 8482,
	'Uacute' => 218,
	'uacute' => 250,
	'uarr' => 8593,
	'uArr' => 8657,
	'Ucirc' => 219,
	'ucirc' => 251,
	'Ugrave' => 217,
	'ugrave' => 249,
	'uml' => 168,
	'upsih' => 978,
	'Upsilon' => 933,
	'upsilon' => 965,
	'Uuml' => 220,
	'uuml' => 252,
	'weierp' => 8472,
	'Xi' => 926,
	'xi' => 958,
	'Yacute' => 221,
	'yacute' => 253,
	'yen' => 165,
	'Yuml' => 376,
	'yuml' => 255,
	'Zeta' => 918,
	'zeta' => 950,
	'zwj' => 8205,
	'zwnj' => 8204
];
334
/**
 * Alternate spellings of entity names, mapped to the canonical key in
 * self::$htmlEntities (both entries here resolve to 'rlm').
 * NOTE(review): the consumer of this table is outside this excerpt.
 */
private static $htmlEntityAliases = [
	'רלמ' => 'rlm',
	'رلم' => 'rlm',
];

/**
 * Lazily built attribute-matching regex; null until getAttribsRegex()
 * is first called.
 */
private static $attribsRegex;
347
/**
 * Return the regular expression used to pick attribute name/value pairs
 * out of the text of an element's start tag. The expression is assembled
 * on first use and cached in self::$attribsRegex.
 *
 * Capture groups: 1 = attribute name, 2 = the "=value" part (absent for a
 * bare attribute), 3 = double-quoted value, 4 = single-quoted value,
 * 5 = unquoted value.
 *
 * @return string PCRE pattern (modifiers s, x, u)
 */
static function getAttribsRegex() {
	if ( self::$attribsRegex !== null ) {
		// Already built on an earlier call.
		return self::$attribsRegex;
	}

	$nameStart = "[:_\p{L}\p{N}]";
	$nameChar = "[:_\.\-\p{L}\p{N}]";
	$ws = '[\x09\x0a\x0c\x0d\x20]';
	self::$attribsRegex =
		"/(?:^|$ws)({$nameStart}{$nameChar}*)
		  ($ws*=$ws*
			(?:
			 # The attribute value: quoted or alone
			  \"([^\"]*)(?:\"|\$)
			| '([^']*)(?:'|\$)
			|  (((?!$ws|>).)*)
			)
		)?(?=$ws|\$)/sxu";

	return self::$attribsRegex;
}
373
/**
 * Return the lookup tables describing which HTML elements are recognised
 * and how they may be nested or closed.
 *
 * The base tables are computed once and kept in function-static variables;
 * they are recomputed only when the value of $wgAllowImageTag differs from
 * the one they were built with. (Previously the cache marker was the flag
 * value itself, so with the flag disabled the tables were rebuilt on every
 * call.)
 *
 * @param array $extratags Additional element names to accept for this call
 * @param array $removetags Element names to reject for this call
 * @return array Map of table name => array keyed by element name:
 *   htmlpairs, htmlsingle, htmlsingleonly, htmlnest, tabletags, htmllist,
 *   listtags, htmlsingleallowed, htmlelements
 */
public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
	global $wgAllowImageTag;

	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

	// Base our staticInitialised variable off of the global config state so that if the globals
	// are changed (like in the test system) we will re-initialise the settings.
	// The marker starts out as null and is stored as a strict boolean, so a
	// disabled flag (false) is still recognised as "already initialised".
	$globalContext = (bool)$wgAllowImageTag;
	if ( $staticInitialised !== $globalContext ) {
		$htmlpairsStatic = [ # Tags that must be closed
			'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
			'kbd', 'samp', 'data', 'time', 'mark'
		];
		$htmlsingle = [
			'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
		];

		# Elements that cannot have close tags. This is (not coincidentally)
		# also the list of tags for which the HTML 5 parsing algorithm
		# requires you to "acknowledge the token's self-closing flag", i.e.
		# a self-closing tag like <br/> is not an HTML 5 parse error only
		# for this list.
		$htmlsingleonly = [
			'br', 'wbr', 'hr', 'meta', 'link'
		];

		$htmlnest = [ # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
			'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
		];
		$tabletags = [ # Can only appear inside table, we will close them
			'td', 'th', 'tr',
		];
		$htmllist = [ # Tags used by list
			'ul', 'ol',
		];
		$listtags = [ # Tags that can appear in a list
			'li',
		];

		if ( $wgAllowImageTag ) {
			$htmlsingle[] = 'img';
			$htmlsingleonly[] = 'img';
		}

		$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
		$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

		# Convert them all to hashtables for faster lookup
		$vars = [ 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
			'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' ];
		foreach ( $vars as $var ) {
			$$var = array_flip( $$var );
		}
		$staticInitialised = $globalContext;
	}

	# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
	$extratags = array_flip( $extratags );
	$removetags = array_flip( $removetags );
	$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
	$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );

	return [
		'htmlpairs' => $htmlpairs,
		'htmlsingle' => $htmlsingle,
		'htmlsingleonly' => $htmlsingleonly,
		'htmlnest' => $htmlnest,
		'tabletags' => $tabletags,
		'htmllist' => $htmllist,
		'listtags' => $listtags,
		'htmlsingleallowed' => $htmlsingleallowed,
		'htmlelements' => $htmlelements,
	];
}
461
/**
 * Escape every tag in $text that is not a recognised element, strip
 * non-approved attributes from the ones that are, and remove HTML comments.
 *
 * The text is split on "<"; each chunk is matched against
 * ELEMENT_BITS_REGEX. Chunks that are not an acceptable tag are emitted
 * with the "<" escaped as "&lt;". Any ">" in plain text is escaped as
 * "&gt;". When MWTidy is not enabled this method additionally tracks open
 * elements on a stack, rejects mismatched close tags and appends close
 * tags for anything left open; when MWTidy is enabled only the per-tag
 * checks are done.
 *
 * @param string $text
 * @param callable|null $processCallback Invoked as ( &$params, $args ) on the
 *   attribute text of each recognised start tag before validation
 * @param array|bool $args Passed through to $processCallback
 * @param array $extratags Extra element names to accept
 * @param array $removetags Element names to reject
 * @param callable|null $warnCallback Invoked with
 *   'deprecated-self-close-category' when a self-closed tag is found on an
 *   element that is not allowed to self-close
 * @return string
 */
public static function removeHTMLtags( $text, $processCallback = null,
	$args = [], $extratags = [], $removetags = [], $warnCallback = null
) {
	// Explicit assignments instead of extract(), so every table used below
	// is visibly defined.
	$tagData = self::getRecognizedTagData( $extratags, $removetags );
	$htmlpairs = $tagData['htmlpairs'];
	$htmlsingle = $tagData['htmlsingle'];
	$htmlsingleonly = $tagData['htmlsingleonly'];
	$htmlnest = $tagData['htmlnest'];
	$tabletags = $tagData['tabletags'];
	$htmllist = $tagData['htmllist'];
	$listtags = $tagData['listtags'];
	$htmlsingleallowed = $tagData['htmlsingleallowed'];
	$htmlelements = $tagData['htmlelements'];

	# Remove HTML comments
	$text = self::removeHTMLcomments( $text );
	$bits = explode( '<', $text );
	$text = str_replace( '>', '&gt;', array_shift( $bits ) );
	if ( !MWTidy::isEnabled() ) {
		$tagstack = $tablestack = [];
		foreach ( $bits as $x ) {
			$regs = [];
			# $slash: Does the current element start with a '/'?
			# $t: Current element name
			# $params: String between element name and >
			# $brace: Ending '>' or '/>'
			# $rest: Everything until the next element of $bits
			if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
				list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
			} else {
				$slash = $t = $params = $brace = $rest = null;
			}

			$badtag = false;
			$t = strtolower( $t );
			if ( isset( $htmlelements[$t] ) ) {
				# Check our stack
				if ( $slash && isset( $htmlsingleonly[$t] ) ) {
					$badtag = true;
				} elseif ( $slash ) {
					# Closing a tag... is it the one we just opened?
					MediaWiki\suppressWarnings();
					$ot = array_pop( $tagstack );
					MediaWiki\restoreWarnings();

					if ( $ot != $t ) {
						if ( isset( $htmlsingleallowed[$ot] ) ) {
							# Pop all elements with an optional close tag
							# and see if we find a match below them
							$optstack = [];
							array_push( $optstack, $ot );
							MediaWiki\suppressWarnings();
							$ot = array_pop( $tagstack );
							MediaWiki\restoreWarnings();
							while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
								array_push( $optstack, $ot );
								MediaWiki\suppressWarnings();
								$ot = array_pop( $tagstack );
								MediaWiki\restoreWarnings();
							}
							if ( $t != $ot ) {
								# No match. Push the optional elements back again
								$badtag = true;
								MediaWiki\suppressWarnings();
								$ot = array_pop( $optstack );
								MediaWiki\restoreWarnings();
								while ( $ot ) {
									array_push( $tagstack, $ot );
									MediaWiki\suppressWarnings();
									$ot = array_pop( $optstack );
									MediaWiki\restoreWarnings();
								}
							}
						} else {
							MediaWiki\suppressWarnings();
							array_push( $tagstack, $ot );
							MediaWiki\restoreWarnings();

							# <li> can be nested in <ul> or <ol>, skip those cases:
							if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
								$badtag = true;
							}
						}
					} else {
						if ( $t == 'table' ) {
							// Leaving a table: restore the stack saved when it was opened.
							$tagstack = array_pop( $tablestack );
						}
					}
					$newparams = '';
				} else {
					# Keep track for later
					if ( isset( $tabletags[$t] ) && !in_array( 'table', $tagstack ) ) {
						$badtag = true;
					} elseif ( in_array( $t, $tagstack ) && !isset( $htmlnest[$t] ) ) {
						$badtag = true;
					# Is it a self closed htmlpair ? (T7487)
					} elseif ( $brace == '/>' && isset( $htmlpairs[$t] ) ) {
						// Eventually we'll just remove the self-closing
						// slash, in order to be consistent with HTML5
						// semantics.
						// $brace = '>';
						// For now, let's just warn authors to clean up.
						if ( is_callable( $warnCallback ) ) {
							call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] );
						}
						$badtag = true;
					} elseif ( isset( $htmlsingleonly[$t] ) ) {
						# Hack to force empty tag for unclosable elements
						$brace = '/>';
					} elseif ( isset( $htmlsingle[$t] ) ) {
						# Hack to not close $htmlsingle tags
						$brace = null;
						# Still need to push this optionally-closed tag to
						# the tag stack so that we can match end tags
						# instead of marking them as bad.
						array_push( $tagstack, $t );
					} elseif ( isset( $tabletags[$t] ) && in_array( $t, $tagstack ) ) {
						// New table tag but forgot to close the previous one
						$text .= "</$t>";
					} else {
						if ( $t == 'table' ) {
							// Entering a table: park the current stack and start a fresh one.
							array_push( $tablestack, $tagstack );
							$tagstack = [];
						}
						array_push( $tagstack, $t );
					}

					# Replace any variables or template parameters with
					# plaintext results.
					if ( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, [ &$params, $args ] );
					}

					if ( !self::validateTag( $params, $t ) ) {
						$badtag = true;
					}

					# Strip non-approved attributes from the tag
					$newparams = self::fixTagAttributes( $params, $t );
				}
				if ( !$badtag ) {
					$rest = str_replace( '>', '&gt;', $rest );
					$close = ( $brace == '/>' && !$slash ) ? ' /' : '';
					$text .= "<$slash$t$newparams$close>$rest";
					continue;
				}
			}
			// Not an acceptable tag: emit the chunk as literal text.
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
		# Close off any remaining tags
		while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
			$text .= "</$t>\n";
			if ( $t == 'table' ) {
				$tagstack = array_pop( $tablestack );
			}
		}
	} else {
		# this might be possible using tidy itself
		foreach ( $bits as $x ) {
			if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
				list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;

				$badtag = false;
				$t = strtolower( $t );
				if ( isset( $htmlelements[$t] ) ) {
					if ( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, [ &$params, $args ] );
					}

					if ( $brace == '/>' && !( isset( $htmlsingle[$t] ) || isset( $htmlsingleonly[$t] ) ) ) {
						// Eventually we'll just remove the self-closing
						// slash, in order to be consistent with HTML5
						// semantics.
						// $brace = '>';
						// For now, let's just warn authors to clean up.
						if ( is_callable( $warnCallback ) ) {
							call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] );
						}
					}
					if ( !self::validateTag( $params, $t ) ) {
						$badtag = true;
					}

					$newparams = self::fixTagAttributes( $params, $t );
					if ( !$badtag ) {
						if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) {
							# Interpret self-closing tags as empty tags even when
							# HTML 5 would interpret them as start tags. Such input
							# is commonly seen on Wikimedia wikis with this intention.
							$brace = "></$t>";
						}

						$rest = str_replace( '>', '&gt;', $rest );
						$text .= "<$slash$t$newparams$brace$rest";
						continue;
					}
				}
			}
			// Not an acceptable tag: emit the chunk as literal text.
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
	}
	return $text;
}
671
/**
 * Strip every terminated HTML comment from $text.
 *
 * If a comment (plus any spaces directly around it) sits between two
 * newlines, the whole run is collapsed to a single newline; otherwise only
 * the comment itself is removed. Processing stops at the first comment
 * that has no closing delimiter, which is left in place.
 *
 * @param string $text
 * @return string
 */
public static function removeHTMLcomments( $text ) {
	for ( ; ; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		// Move past the closing delimiter.
		$close += 3;

		# Widen the span over spaces on both sides of the comment
		$left = max( $open - 1, 0 );
		$span = $close - $left;
		while ( $left > 0 && substr( $text, $left, 1 ) === ' ' ) {
			$left--;
			$span++;
		}
		while ( substr( $text, $left + $span, 1 ) === ' ' ) {
			$span++;
		}

		$newlineBefore = substr( $text, $left, 1 ) === "\n";
		$newlineAfter = substr( $text, $left + $span, 1 ) === "\n";
		if ( $newlineBefore && $newlineAfter ) {
			# Drop the comment with its surrounding spaces and one of
			# the two newlines.
			$text = substr_replace( $text, "\n", $left, $span + 1 );
		} else {
			# Drop the comment only.
			$text = substr_replace( $text, '', $open, $close - $open );
		}
	}
	return $text;
}
714
/**
 * Check element-specific attribute requirements for a tag.
 *
 * Only "meta" and "link" are restricted: both need an itemprop attribute;
 * "meta" additionally needs content and "link" additionally needs href.
 * Every other element passes.
 *
 * @param string $params Raw attribute text of the tag
 * @param string $element Element name
 * @return bool True if the tag is acceptable
 */
static function validateTag( $params, $element ) {
	$attribs = self::decodeTagAttributes( $params );

	$isMeta = ( $element == 'meta' );
	$isLink = ( $element == 'link' );
	if ( !$isMeta && !$isLink ) {
		return true;
	}

	// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
	if ( !isset( $attribs['itemprop'] ) ) {
		return false;
	}
	// <meta> must have a content="" for the itemprop
	if ( $isMeta && !isset( $attribs['content'] ) ) {
		return false;
	}
	// <link> must have an associated href=""
	if ( $isLink && !isset( $attribs['href'] ) ) {
		return false;
	}

	return true;
}
747
/**
 * Filter a decoded attribute array against the whitelist for $element.
 *
 * @param array $attribs Attribute name => value
 * @param string $element Element name whose whitelist is looked up
 * @return array Result of validateAttributes()
 */
static function validateTagAttributes( $attribs, $element ) {
	$whitelist = self::attributeWhitelist( $element );
	return self::validateAttributes( $attribs, $whitelist );
}
767
/**
 * Filter and normalise a decoded attribute array.
 *
 * Attributes are kept only if they are whitelisted, are non-namespaced
 * data-* attributes that are not reserved, or are xmlns:* declarations
 * with a harmless value. Surviving values are post-processed: style goes
 * through checkCss(), id and the aria id-reference lists are escaped,
 * RDFa/microdata values matching EVIL_URI_PATTERN are dropped, and
 * href/src/poster must start with an allowed protocol.
 *
 * @param array $attribs Attribute name => value
 * @param array $whitelist List of allowed attribute names
 * @return array Filtered attribute name => value
 */
static function validateAttributes( $attribs, $whitelist ) {
	$allowed = array_flip( $whitelist );
	$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	# Attributes holding space-separated lists of ids
	$idListAttribs = [ 'aria-describedby', 'aria-flowto', 'aria-labelledby', 'aria-owns' ];
	# RDFa and HTML5 microdata attributes, plus rel/rev
	$uriLikeAttribs = [
		'rel', 'rev',
		'about', 'property', 'resource', 'datatype', 'typeof',
		'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',
	];
	$urlAttribs = [ 'href', 'src', 'poster' ];

	$out = [];
	foreach ( $attribs as $attribute => $value ) {
		# Allow XML namespace declaration to allow RDFa
		if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				$out[$attribute] = $value;
			}
			continue;
		}

		# Allow any attribute beginning with "data-"
		# However:
		# * Disallow data attributes used by MediaWiki code
		# * Ensure that the attribute is not namespaced by banning
		#   colons.
		$isPlainData = preg_match( '/^data-[^:]*$/i', $attribute );
		$isWhitelisted = isset( $allowed[$attribute] );
		if ( ( !$isPlainData && !$isWhitelisted )
			|| self::isReservedDataAttribute( $attribute )
		) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# https://msdn.microsoft.com/en-us/library/ms537634.aspx
		if ( $attribute == 'style' ) {
			$value = self::checkCss( $value );
		}

		# Escape HTML id attributes
		if ( $attribute === 'id' ) {
			$value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
		}

		# Escape HTML id reference lists
		if ( in_array( $attribute, $idListAttribs, true ) ) {
			$value = self::escapeIdReferenceList( $value, 'noninitial' );
		}

		// RDFa and microdata properties allow URLs, URIs and/or CURIs.
		// Paranoia. Allow "simple" values but suppress javascript
		if ( in_array( $attribute, $uriLikeAttribs, true )
			&& preg_match( self::EVIL_URI_PATTERN, $value )
		) {
			continue;
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		#       validation code that can be used by tag hook handlers, etc
		if ( in_array( $attribute, $urlAttribs, true ) && !preg_match( $hrefExp, $value ) ) {
			// Drop any href or src attributes not using an allowed protocol.
			// NOTE: this also drops all relative URLs
			continue;
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$out[$attribute] = $value;
	}

	# itemtype, itemid, itemref don't make sense without itemscope
	if ( !array_key_exists( 'itemscope', $out ) ) {
		unset( $out['itemtype'], $out['itemid'], $out['itemref'] );
	}
	# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

	return $out;
}
872
/**
 * Tell whether an attribute name falls in a reserved data-* namespace.
 *
 * Any name starting (case-insensitively) with data-ooui, data-mw or
 * data-parsoid is treated as reserved.
 *
 * @param string $attr Attribute name
 * @return bool
 */
public static function isReservedDataAttribute( $attr ) {
	// data-ooui is reserved for ooui.
	// data-mw and data-parsoid are reserved for parsoid.
	// data-mw-<name here> is reserved for extensions (or core) if
	// they need to communicate some data to the client and want to be
	// sure that it isn't coming from an untrusted user.
	// We ignore the possibility of namespaces since user-generated HTML
	// can't use them anymore.
	$hits = preg_match( '/^data-(ooui|mw|parsoid)/i', $attr );
	return $hits === 1;
}
890
/**
 * Merge two attribute arrays; entries in $b win over entries in $a,
 * except that when both carry a different string "class" the two class
 * lists are combined (whitespace-split, de-duplicated, space-joined).
 *
 * @param array $a
 * @param array $b
 * @return array
 */
static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	if ( !isset( $a['class'], $b['class'] ) ) {
		return $merged;
	}
	$left = $a['class'];
	$right = $b['class'];
	if ( !is_string( $left ) || !is_string( $right ) || $left === $right ) {
		return $merged;
	}

	$names = preg_split( '/\s+/', "{$left} {$right}", -1, PREG_SPLIT_NO_EMPTY );
	$merged['class'] = implode( ' ', array_unique( $names ) );
	return $merged;
}
913
/**
 * Normalise a CSS value so that obfuscated constructs become visible to
 * later checks: character references and CSS backslash escapes are
 * decoded, fullwidth and other look-alike characters are folded to ASCII,
 * and comments are replaced.
 *
 * @param string $value
 * @return string
 */
public static function normalizeCss( $value ) {
	// Decode character references like &#123;
	$value = self::decodeCharReferences( $value );

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references.
	// This means it isn't possible for this function to return
	// unsanitized escape sequences. It is possible to manufacture
	// input that contains character references that decode to
	// escape sequences that decode to character references, but
	// it's OK for the return value to contain character references
	// because the caller is supposed to escape those anyway.
	static $decodeRegex;
	if ( !$decodeRegex ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$decodeRegex = "/ $backslash
			(?:
				($nl) |  # 1. Line continuation
				([0-9A-Fa-f]{1,6})$space? |  # 2. character number
				(.) |  # 3. backslash cancelling special meaning
				() |  # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback( $decodeRegex,
		[ __CLASS__, 'cssDecodeCallback' ], $value );

	// Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii
	// U+FF01 to U+FF5A, excluding U+FF3C (T60088). The range is written with
	// code point escapes so it cannot be flattened to ASCII by text tooling.
	$value = preg_replace_callback(
		'/[\x{FF01}-\x{FF3B}\x{FF3D}-\x{FF5A}]/u',
		function ( $matches ) {
			$cp = UtfNormal\Utils::utf8ToCodepoint( $matches[0] );
			if ( $cp === false ) {
				return '';
			}
			return chr( $cp - 65248 ); // ASCII range \x21-\x7A
		},
		$value
	);

	// Convert more characters IE6 might treat as ascii
	// U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
	$value = str_replace(
		[ 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ],
		[ 'r', 'n', 'n', 'l', 'i', '(', '(' ],
		$value
	);

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
		// Remove any comments; IE gets token splitting wrong
		// This must be done AFTER decoding character references and
		// escape sequences, because those steps can introduce comments
		// This step cannot introduce character references or escape
		// sequences, because it replaces comments with spaces rather
		// than removing them completely.
		$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

		// Remove anything after a comment-start token, to guard against
		// incorrect client implementations.
		$commentPos = strpos( $value, '/*' );
		if ( $commentPos !== false ) {
			$value = substr( $value, 0, $commentPos );
		}
	}

	// S followed by repeat, iteration, or prolonged sound marks,
	// which IE will treat as "ss"
	$value = preg_replace(
		'/s(?:
			\xE3\x80\xB1 | # U+3031
			\xE3\x82\x9D | # U+309D
			\xE3\x83\xBC | # U+30FC
			\xE3\x83\xBD | # U+30FD
			\xEF\xB9\xBC | # U+FE7C
			\xEF\xB9\xBD | # U+FE7D
			\xEF\xBD\xB0   # U+FF70
		)/ix',
		'ss',
		$value
	);

	return $value;
}
1011
/**
 * Normalise a CSS value and reject it if it contains control characters
 * or one of a list of constructs considered unsafe.
 *
 * @param string $value
 * @return string The normalised value, or one of the placeholder comments
 *   "/* invalid control char *​/" and "/* insecure input *​/"
 */
static function checkCss( $value ) {
	$value = self::normalizeCss( $value );

	// Reject problematic keywords and control characters
	if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ||
		strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false ) {
		return '/* invalid control char */';
	} elseif ( preg_match(
		// Each "\(" below must be a plain backslash followed directly by the
		// parenthesis; a stray invisible character in between breaks the pattern.
		'! expression
			| filter\s*:
			| accelerator\s*:
			| -o-link\s*:
			| -o-link-source\s*:
			| -o-replace\s*:
			| url\s*\(
			| image\s*\(
			| image-set\s*\(
			| attr\s*\([^)]+[\s,]+url
			| var\s*\(
		!ix', $value ) ) {
		return '/* insecure input */';
	}
	return $value;
}
1054
/**
 * preg_replace_callback handler for the escape-decoding regex built in
 * normalizeCss(). Turns one CSS backslash sequence into the character it
 * stands for, re-escaping it in hex form when the result is a newline,
 * a quote or a backslash.
 *
 * @param array $matches Groups: 1 = line continuation, 2 = hex digits,
 *   3 = single escaped character
 * @return string
 */
static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		// Line continuation
		return '';
	}

	if ( $matches[2] !== '' ) {
		$decoded = UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$decoded = $matches[3];
	} else {
		// Lone backslash at end of input
		$decoded = '\\';
	}

	$needsEscape = in_array( $decoded, [ "\n", '"', "'", '\\' ] );
	if ( !$needsEscape ) {
		// Decode unnecessary escape
		return $decoded;
	}
	// These characters need to be escaped in strings
	// Clean up the escape sequence to avoid parsing errors by clients
	return '\\' . dechex( ord( $decoded ) ) . ' ';
}
1079
/**
 * Turn the raw attribute text of a tag into sanitised attribute text:
 * decode it, filter it against the element's whitelist, optionally sort
 * by attribute name, and re-encode it.
 *
 * @param string $text Raw attribute text
 * @param string $element Element name
 * @param bool $sorted Whether to sort the attributes by name
 * @return string Empty string if $text is blank
 */
static function fixTagAttributes( $text, $element, $sorted = false ) {
	if ( trim( $text ) == '' ) {
		return '';
	}

	$attribs = self::validateTagAttributes(
		self::decodeTagAttributes( $text ),
		$element
	);
	if ( $sorted ) {
		ksort( $attribs );
	}

	return self::safeEncodeTagAttributes( $attribs );
}
1115
/**
 * Encode a value for use inside a quoted HTML attribute: HTML special
 * characters (including both quote styles) are escaped, and newline,
 * carriage return and tab become numeric character references.
 *
 * @param string $text
 * @return string
 */
static function encodeAttribute( $text ) {
	$escaped = htmlspecialchars( $text, ENT_QUOTES );

	// Whitespace is normalized during attribute decoding,
	// so if we've been passed non-spaces we must encode them
	// ahead of time or they won't be preserved.
	return str_replace(
		[ "\n", "\r", "\t" ],
		[ '&#10;', '&#13;', '&#9;' ],
		$escaped
	);
}
1135
/**
 * Encode an attribute value like encodeAttribute(), then additionally
 * neutralise character sequences that later wikitext parsing could act
 * on (braces, brackets, pipes, magic-link keywords, URL protocols, ...).
 *
 * @param string $text
 * @return string
 */
static function safeEncodeAttribute( $text ) {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$armor = [
		'<' => '&lt;', // This should never happen,
		'>' => '&gt;', // we've received invalid input
		'"' => '&quot;', // which should have been escaped.
		'{' => '&#123;',
		'}' => '&#125;', // prevent unpaired language conversion syntax
		'[' => '&#91;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	];
	$armored = strtr( self::encodeAttribute( $text ), $armor );

	# Break up anything that looks like a URL protocol by encoding its colon
	$protocolRegex = '/((?i)' . wfUrlProtocols() . ')/';
	return preg_replace_callback(
		$protocolRegex,
		[ 'Sanitizer', 'armorLinksCallback' ],
		$armored
	);
}
1169
/**
 * Turn arbitrary text into a string usable as an HTML id.
 *
 * With $wgExperimentalHtmlIds enabled (and no 'legacy' option) runs of
 * whitespace and of the characters _ ' " & # % are collapsed to a single
 * underscore and leading/trailing underscores are trimmed. Otherwise
 * HTML4-style escaping is used: spaces become underscores, the result is
 * URL-encoded with "%" rewritten to "." (and "%3A" back to ":"), and an
 * "x" is prefixed if the id does not start with a letter unless the
 * 'noninitial' option is given.
 *
 * @param string $id
 * @param string|array $options A single option name or a list of them:
 *   'noninitial', 'legacy'
 * @return string
 */
static function escapeId( $id, $options = [] ) {
	// The configuration flag must be imported explicitly; without this
	// declaration the variable below would be an undefined local.
	global $wgExperimentalHtmlIds;
	// Accept a bare option string as well as a list of options.
	$options = (array)$options;

	if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
		$id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
		$id = trim( $id, '_' );
		if ( $id === '' ) {
			// Must have been all whitespace to start with.
			return '_';
		} else {
			return $id;
		}
	}

	// HTML4-style escaping
	static $replace = [
		'%3A' => ':',
		'%' => '.'
	];

	$id = urlencode( strtr( $id, ' ', '_' ) );
	$id = strtr( $id, $replace );

	if ( !preg_match( '/^[a-zA-Z]/', $id ) && !in_array( 'noninitial', $options ) ) {
		// Initial character must be a letter!
		$id = "x$id";
	}
	return $id;
}
1233
/**
 * Escape $id for use in an id attribute, using the escaping mode stored
 * at index $mode of $wgFragmentMode.
 *
 * @param string $id
 * @param int $mode self::ID_PRIMARY or self::ID_FALLBACK
 * @return string|bool Escaped id, or false when the requested non-primary
 *   mode is not configured
 * @throws UnexpectedValueException When no primary mode is configured
 */
public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
	global $wgFragmentMode;

	if ( isset( $wgFragmentMode[$mode] ) ) {
		return self::escapeIdInternal( $id, $wgFragmentMode[$mode] );
	}

	if ( $mode === self::ID_PRIMARY ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}
	return false;
}
1263
/**
 * Escape $id for use as a link fragment, using the primary mode from
 * $wgFragmentMode.
 *
 * @param string $id
 * @return string
 * @throws UnexpectedValueException When no primary mode is configured
 */
public static function escapeIdForLink( $id ) {
	global $wgFragmentMode;

	$primary = self::ID_PRIMARY;
	if ( !isset( $wgFragmentMode[$primary] ) ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return self::escapeIdInternal( $id, $wgFragmentMode[$primary] );
}
1289
/**
 * Escape $id using the mode configured in
 * $wgExternalInterwikiFragmentMode.
 *
 * @param string $id
 * @return string
 */
public static function escapeIdForExternalInterwiki( $id ) {
	// Import the setting; without this declaration the variable used below
	// would be an undefined local and the mode would always be invalid.
	global $wgExternalInterwikiFragmentMode;

	$id = self::escapeIdInternal( $id, $wgExternalInterwikiFragmentMode );

	return $id;
}
1306
/**
 * Escape $id according to a named fragment mode.
 *
 * - 'html5': spaces become underscores.
 * - 'legacy': spaces become underscores, then URL-encoding with "%3A"
 *   restored to ":" and remaining "%" turned into ".".
 * - 'html5-legacy': runs of whitespace and _ ' " & # % collapse to one
 *   underscore, outer underscores are trimmed, empty result becomes "_".
 *
 * @param string $id
 * @param string $mode
 * @return string
 * @throws InvalidArgumentException For any other mode
 */
private static function escapeIdInternal( $id, $mode ) {
	// This corresponds to 'noninitial' mode of the old escapeId()
	static $replace = [
		'%3A' => ':',
		'%' => '.'
	];

	if ( $mode == 'html5' ) {
		return str_replace( ' ', '_', $id );
	}

	if ( $mode == 'legacy' ) {
		$encoded = urlencode( str_replace( ' ', '_', $id ) );
		return strtr( $encoded, $replace );
	}

	if ( $mode == 'html5-legacy' ) {
		$collapsed = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
		$collapsed = trim( $collapsed, '_' );
		// Must have been all whitespace to start with.
		return $collapsed === '' ? '_' : $collapsed;
	}

	throw new InvalidArgumentException( "Invalid mode '$mode' passed to '" . __METHOD__ );
}
1343
/**
 * Escape each id in a whitespace-separated list of ids and return the
 * list re-joined with single spaces.
 *
 * @param string $referenceString Space-delimited list of ids
 * @param string|array $options Not read by this implementation
 * @return string Empty string when the list contains no ids
 */
static function escapeIdReferenceList( $referenceString, $options = [] ) {
	# Break the list into its individual id tokens
	$tokens = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );

	# Escape each token as an id
	$escaped = [];
	foreach ( $tokens as $index => $token ) {
		$escaped[$index] = self::escapeIdForAttribute( $token );
	}

	# Join the tokens back into one space-delimited string
	return implode( ' ', $escaped );
}
1371
/**
 * Turn arbitrary text into something usable as a CSS class name.
 *
 * A leading digit or hyphen, control characters, spaces, ASCII
 * punctuation and the bytes C2 A0 are replaced by underscores; runs of
 * underscores are then collapsed and trailing underscores removed.
 *
 * @param string $class
 * @return string
 */
static function escapeClass( $class ) {
	// Convert ugly stuff to underscores and kill underscores in ugly places
	$patterns = [
		'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
		'/_+/',
	];
	$underscored = preg_replace( $patterns, '_', $class );
	return rtrim( $underscored, '_' );
}
1390
/**
 * Decode the character references in $html and then HTML-escape the
 * result, so that existing entities are not double-escaped.
 *
 * @param string $html
 * @return string
 */
static function escapeHtmlAllowEntities( $html ) {
	$decoded = self::decodeCharReferences( $html );
	# It seems wise to escape ' as well as ", as a matter of course.  Can't
	# hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
	# don't cause the entire string to disappear.
	return htmlspecialchars( $decoded, ENT_QUOTES | ENT_SUBSTITUTE );
}
1406
/**
 * preg_replace_callback handler used by safeEncodeAttribute(): replaces
 * every colon in the first capture group with its numeric reference.
 *
 * @param array $matches
 * @return string
 */
private static function armorLinksCallback( $matches ) {
	$protocol = $matches[1];
	return strtr( $protocol, [ ':' => '&#58;' ] );
}
1415
/**
 * Parse the attribute portion of a tag into an associative array.
 *
 * Attribute names are lower-cased. Values have runs of tab, CR, LF and
 * space collapsed to a single space, are trimmed, and then have character
 * references decoded. When a name occurs more than once the last
 * occurrence wins.
 *
 * @param string $text Attribute text as found inside a tag
 * @return array Map of lower-cased attribute name => decoded value
 */
public static function decodeTagAttributes( $text ) {
	$decoded = [];

	if ( trim( $text ) == '' ) {
		return $decoded;
	}

	$matchSets = [];
	$found = preg_match_all( self::getAttribsRegex(), $text, $matchSets, PREG_SET_ORDER );
	if ( !$found ) {
		return $decoded;
	}

	foreach ( $matchSets as $matchSet ) {
		$name = strtolower( $matchSet[1] );
		$raw = self::getTagAttributeCallback( $matchSet );

		// Normalize whitespace, then strip it from both ends
		$normalized = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

		// Decode character references
		$decoded[$name] = self::decodeCharReferences( $normalized );
	}

	return $decoded;
}
1452
/**
 * Build an attribute string from an associative array of names and values.
 *
 * Names are escaped with htmlspecialchars() and values with
 * safeEncodeAttribute(); each pair is rendered as name="value".
 *
 * @param array $assoc_array Map of attribute name => value
 * @return string Pairs joined by spaces with one leading space, or '' when
 *   the array is empty
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
	$pairs = [];
	foreach ( $assoc_array as $name => $rawValue ) {
		$encName = htmlspecialchars( $name );
		$encValue = self::safeEncodeAttribute( $rawValue );
		$pairs[] = $encName . '="' . $encValue . '"';
	}

	if ( !$pairs ) {
		return '';
	}

	return ' ' . implode( ' ', $pairs );
}
1470
/**
 * Pick the appropriate attribute value from a match set from the
 * attribs regex matches.
 *
 * @param array $set Match set for one attribute
 * @throws MWException When group 2 is set but none of groups 3-5 is
 * @return string The attribute value, or "" for a value-less attribute
 */
private static function getTagAttributeCallback( $set ) {
	# Value groups in priority order:
	# 5 = no quotes, 4 = single-quoted, 3 = double-quoted
	foreach ( [ 5, 4, 3 ] as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( isset( $set[2] ) ) {
		throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
	}

	# In XHTML, attributes must have a value so return an empty string.
	# See "Empty attribute syntax",
	# https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
	return "";
}
1498
/**
 * Replace each CRLF pair, and each individual space, CR, LF or tab, with
 * a single space. Consecutive whitespace characters are not collapsed.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
	$whitespace = '/\r\n|[\x20\x0d\x0a\x09]/';

	return preg_replace( $whitespace, ' ', $text );
}
1509
/**
 * Normalizes whitespace in a section name, such as might be returned
 * by Parser::stripSectionName().
 *
 * Every run of spaces and/or underscores becomes a single space, and
 * leading and trailing whitespace is trimmed.
 *
 * @param string $section
 * @return string
 */
static function normalizeSectionNameWhitespace( $section ) {
	return trim( preg_replace( '/[ _]+/', ' ', $section ) );
}
1521
/**
 * Ensure that any entities and character references are legal
 * for XML and XHTML specifically.
 *
 * Every match of CHAR_REFS_REGEX in the text is rewritten by
 * normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
static function normalizeCharReferences( $text ) {
	$callback = [ 'Sanitizer', 'normalizeCharReferencesCallback' ];

	return preg_replace_callback( self::CHAR_REFS_REGEX, $callback, $text );
}
1543
/**
 * Replace callback used by normalizeCharReferences().
 *
 * Dispatches on which capture group of CHAR_REFS_REGEX matched:
 * group 1 is a named entity, group 2 a decimal reference and group 3 a
 * hexadecimal reference. When no group matched, or the handler rejected
 * the reference by returning null, the whole match is escaped with
 * htmlspecialchars() so it is output literally.
 *
 * @param array $matches Match array from CHAR_REFS_REGEX
 * @return string
 */
static function normalizeCharReferencesCallback( $matches ) {
	$ret = null;
	if ( $matches[1] != '' ) {
		$ret = self::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		$ret = self::decCharReference( $matches[2] );
	} elseif ( $matches[3] != '' ) {
		$ret = self::hexCharReference( $matches[3] );
	}

	if ( $ret === null ) {
		// Bare ampersand or invalid reference
		return htmlspecialchars( $matches[0] );
	}

	return $ret;
}
1563
/**
 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 * return the equivalent numeric entity reference (except for the core
 * lt, gt, amp and quot entities, which are kept by name). If the entity
 * is a MediaWiki-specific alias, the named entity it stands for is
 * returned. Otherwise the text is returned with the ampersand escaped
 * so it is output literally.
 *
 * @param string $name Entity name without the leading '&' and trailing ';'
 * @return string
 */
static function normalizeEntity( $name ) {
	if ( isset( self::$htmlEntityAliases[$name] ) ) {
		return '&' . self::$htmlEntityAliases[$name] . ';';
	}

	// Strict comparison: a loose in_array() would let non-string input
	// such as integer 0 match these names on PHP < 8.
	if ( in_array( $name, [ 'lt', 'gt', 'amp', 'quot' ], true ) ) {
		return "&$name;";
	}

	if ( isset( self::$htmlEntities[$name] ) ) {
		return '&#' . self::$htmlEntities[$name] . ';';
	}

	return "&amp;$name;";
}
1585
/**
 * Normalize a decimal numeric character reference.
 *
 * @param string $codepoint Decimal digits of the reference
 * @return string|null Reference in the form &#N;, or null when the code
 *   point fails validateCodepoint()
 */
static function decCharReference( $codepoint ) {
	$point = intval( $codepoint );

	return self::validateCodepoint( $point )
		? sprintf( '&#%d;', $point )
		: null;
}
1598
/**
 * Normalize a hexadecimal numeric character reference.
 *
 * @param string $codepoint Hexadecimal digits of the reference
 * @return string|null Reference in the form &#xN; with lower-case hex
 *   digits, or null when the code point fails validateCodepoint()
 */
static function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );

	return self::validateCodepoint( $point )
		? sprintf( '&#x%x;', $point )
		: null;
}
1611
/**
 * Returns true if a given Unicode codepoint is a valid character in
 * both HTML5 and XML.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	# U+007F - U+009F are disallowed in HTML5 (control characters).

	// Tab and line feed are the only accepted code points below U+0020
	if ( $codepoint == 0x09 || $codepoint == 0x0a ) {
		return true;
	}

	// Inclusive [ first, last ] ranges of accepted code points
	$ranges = [
		[ 0x20, 0x7e ],
		[ 0xa0, 0xd7ff ],
		[ 0xe000, 0xfffd ],
		[ 0x10000, 0x10ffff ],
	];
	foreach ( $ranges as $range ) {
		if ( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
			return true;
		}
	}

	return false;
}
1629
/**
 * Decode any character references, numeric or named entities,
 * in the text and return a UTF-8 string.
 *
 * Every match of CHAR_REFS_REGEX is rewritten by
 * decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
	$callback = [ 'Sanitizer', 'decodeCharReferencesCallback' ];

	return preg_replace_callback( self::CHAR_REFS_REGEX, $callback, $text );
}
1643
/**
 * Decode any character references, numeric or named entities, in the
 * text and, if anything matched, normalize the resulting string with the
 * content language.
 *
 * Behaves like decodeCharReferences(), except that
 * $wgContLang->normalize() is applied to the result when at least one
 * CHAR_REFS_REGEX match was processed.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
	global $wgContLang;

	$decoded = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		[ 'Sanitizer', 'decodeCharReferencesCallback' ],
		$text,
		-1, // no limit on the number of replacements
		$replacements
	);

	// Nothing matched, so the text is unchanged and needs no normalization
	if ( !$replacements ) {
		return $decoded;
	}

	return $wgContLang->normalize( $decoded );
}
1670
/**
 * Replace callback used by decodeCharReferences() and
 * decodeCharReferencesAndNormalize().
 *
 * Dispatches on which capture group of CHAR_REFS_REGEX matched:
 * group 1 is a named entity, group 2 a decimal reference and group 3 a
 * hexadecimal reference.
 *
 * @param array $matches Match array from CHAR_REFS_REGEX
 * @return string Decoded text, or the unchanged match for a bare ampersand
 */
static function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		return self::decodeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		return self::decodeChar( intval( $matches[2] ) );
	} elseif ( $matches[3] != '' ) {
		return self::decodeChar( hexdec( $matches[3] ) );
	}

	# Last case should be an ampersand by itself
	return $matches[0];
}
1686
/**
 * Return UTF-8 string for a codepoint if that is a valid
 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 *
 * @param int $codepoint
 * @return string
 */
static function decodeChar( $codepoint ) {
	if ( !self::validateCodepoint( $codepoint ) ) {
		return UtfNormal\Constants::UTF8_REPLACEMENT;
	}

	return UtfNormal\Utils::codepointToUtf8( $codepoint );
}
1701
/**
 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 * return the UTF-8 encoding of that character. Otherwise, returns
 * pseudo-entity source (eg "&foo;").
 *
 * Aliases listed in $htmlEntityAliases are resolved first, so an
 * unrecognised alias target is what appears in the pseudo-entity.
 *
 * @param string $name Entity name without the leading '&' and trailing ';'
 * @return string
 */
static function decodeEntity( $name ) {
	$canonical = isset( self::$htmlEntityAliases[$name] )
		? self::$htmlEntityAliases[$name]
		: $name;

	if ( !isset( self::$htmlEntities[$canonical] ) ) {
		return "&$canonical;";
	}

	return UtfNormal\Utils::codepointToUtf8( self::$htmlEntities[$canonical] );
}
1720
/**
 * Fetch the whitelist of acceptable attributes for a given element name.
 *
 * @param string $element
 * @return array Allowed attribute names; empty when the element has no entry
 */
static function attributeWhitelist( $element ) {
	$whitelists = self::setupAttributeWhitelist();

	if ( isset( $whitelists[$element] ) ) {
		return $whitelists[$element];
	}

	return [];
}
1733
/**
 * Foreach array key (an allowed HTML element), return an array
 * of allowed attributes.
 *
 * The table is built on the first call and cached in a static variable
 * for the rest of the request.
 *
 * @return array Map of element name => list of allowed attribute names
 */
static function setupAttributeWhitelist() {
	static $whitelist;

	if ( $whitelist !== null ) {
		return $whitelist;
	}

	$common = [
		# HTML
		'id',
		'class',
		'style',
		'lang',
		'dir',
		'title',

		# WAI-ARIA
		'aria-describedby',
		'aria-flowto',
		'aria-label',
		'aria-labelledby',
		'aria-owns',
		'role',

		# RDFa
		# These attributes are specified in section 9 of
		# https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		'about',
		'property',
		'resource',
		'datatype',
		'typeof',

		# Microdata. These are specified by
		# https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
		'itemid',
		'itemprop',
		'itemref',
		'itemscope',
		'itemtype',
	];

	// Helper: the common attributes followed by element-specific extras
	$with = function ( array $extra ) use ( $common ) {
		return array_merge( $common, $extra );
	};

	$block = $with( [ 'align' ] );
	$tablealign = [ 'align', 'valign' ];
	$tablecell = [
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor', # deprecated
	];
	// <td> and <th> share one list: common, then cell, then alignment attributes
	$cell = $with( array_merge( $tablecell, $tablealign ) );

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: https://www.w3.org/TR/html4/
	$whitelist = [
		# 7.5.4
		'div' => $block,
		'center' => $common, # deprecated
		'span' => $common,

		# 7.5.5
		'h1' => $block,
		'h2' => $block,
		'h3' => $block,
		'h4' => $block,
		'h5' => $block,
		'h6' => $block,

		# 7.5.6
		# address

		# 8.2.4
		'bdo' => $common,

		# 9.2.1
		'em' => $common,
		'strong' => $common,
		'cite' => $common,
		'dfn' => $common,
		'code' => $common,
		'samp' => $common,
		'kbd' => $common,
		'var' => $common,
		'abbr' => $common,
		# acronym

		# 9.2.2
		'blockquote' => $with( [ 'cite' ] ),
		'q' => $with( [ 'cite' ] ),

		# 9.2.3
		'sub' => $common,
		'sup' => $common,

		# 9.3.1
		'p' => $block,

		# 9.3.2
		'br' => $with( [ 'clear' ] ),

		# https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
		'wbr' => $common,

		# 9.3.4
		'pre' => $with( [ 'width' ] ),

		# 9.4
		'ins' => $with( [ 'cite', 'datetime' ] ),
		'del' => $with( [ 'cite', 'datetime' ] ),

		# 10.2
		'ul' => $with( [ 'type' ] ),
		'ol' => $with( [ 'type', 'start', 'reversed' ] ),
		'li' => $with( [ 'type', 'value' ] ),

		# 10.3
		'dl' => $common,
		'dd' => $common,
		'dt' => $common,

		# 11.2.1
		'table' => $with( [
			'summary', 'width', 'border', 'frame',
			'rules', 'cellspacing', 'cellpadding',
			'align', 'bgcolor',
		] ),

		# 11.2.2
		'caption' => $block,

		# 11.2.3
		'thead' => $common,
		'tfoot' => $common,
		'tbody' => $common,

		# 11.2.4
		'colgroup' => $with( [ 'span' ] ),
		'col' => $with( [ 'span' ] ),

		# 11.2.5
		'tr' => $with( array_merge( [ 'bgcolor' ], $tablealign ) ),

		# 11.2.6
		'td' => $cell,
		'th' => $cell,

		# 12.2
		# NOTE: <a> is not allowed directly, but the attrib
		# whitelist is used from the Parser object
		'a' => $with( [ 'href', 'rel', 'rev' ] ), # rel/rev esp. for RDFa

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized, or if $wgAllowImageTag is
		# true
		'img' => $with( [ 'alt', 'src', 'width', 'height', 'srcset' ] ),

		'video' => $with( [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
		'source' => $with( [ 'type', 'src' ] ),
		'track' => $with( [ 'type', 'src', 'srclang', 'kind', 'label' ] ),

		# 15.2.1
		'tt' => $common,
		'b' => $common,
		'i' => $common,
		'big' => $common,
		'small' => $common,
		'strike' => $common,
		's' => $common,
		'u' => $common,

		# 15.2.2
		'font' => $with( [ 'size', 'color', 'face' ] ),
		# basefont

		# 15.3
		'hr' => $with( [ 'width' ] ),

		# HTML Ruby annotation text module, simple ruby only.
		# https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
		'ruby' => $common,
		# rbc
		'rb' => $common,
		'rp' => $common,
		'rt' => $common, # 'rbspan' is not included
		'rtc' => $common,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# https://www.w3.org/TR/REC-MathML/
		'math' => [ 'class', 'style', 'id', 'title' ],

		// HTML 5 section 4.5
		'figure' => $common,
		'figcaption' => $common,

		# HTML 5 section 4.6
		'bdi' => $common,

		# HTML5 elements, defined by:
		# https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
		'data' => $with( [ 'value' ] ),
		'time' => $with( [ 'datetime' ] ),
		'mark' => $common,

		// meta and link are only permitted by removeHTMLtags when Microdata
		// is enabled so we don't bother adding a conditional to hide these
		// Also meta and link are only valid in WikiText as Microdata elements
		// (ie: validateTag rejects tags missing the attributes needed for Microdata)
		// So we don't bother including $common attributes that have no purpose.
		'meta' => [ 'itemprop', 'content' ],
		'link' => [ 'itemprop', 'href', 'title' ],
	];

	return $whitelist;
}
1960
/**
 * Take a fragment of (potentially invalid) HTML and return a version
 * with any tags removed, character references decoded and whitespace
 * normalized.
 *
 * @param string $text HTML fragment
 * @return string
 */
static function stripAllTags( $text ) {
	# Actual <tags>
	$withoutTags = StringUtils::delimiterReplace( '<', '>', '', $text );

	# Normalize &entities, then whitespace
	$decoded = self::decodeCharReferences( $withoutTags );

	return self::normalizeWhitespace( $decoded );
}
1981
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations.
 *
 * Emits one <!ENTITY> declaration per entry of $htmlEntities, mapping
 * the entity name to its decimal numeric character reference.
 *
 * @return string
 */
static function hackDocType() {
	$declarations = '';
	foreach ( self::$htmlEntities as $entity => $codepoint ) {
		$declarations .= "<!ENTITY $entity \"&#$codepoint;\">";
	}

	return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
1999
/**
 * Clean up a URL: decode character references, percent-encode control
 * and delimiter characters, and strip characters from the host part
 * that are ignored in IDNs.
 *
 * @param string $url
 * @return string
 */
static function cleanUrl( $url ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = self::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step
	$url = preg_replace_callback(
		'/[\][<>"\\x00-\\x20\\x7F\|]/',
		[ __CLASS__, 'cleanUrlCallback' ],
		$url
	);

	# Split into protocol, optional //host and the rest
	$parts = [];
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $parts ) ) {
		return $url;
	}
	$protocol = $parts[1];
	$host = $parts[2];
	$rest = $parts[3];

	// Characters that will be ignored in IDNs.
	// https://tools.ietf.org/html/rfc3454#section-3.1
	// Strip them before further processing so blacklists and such work.
	$strip = "/
		\\s|          # general whitespace
		\xc2\xad|     # 00ad SOFT HYPHEN
		\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
		\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
		\xe2\x81\xa0| # 2060 WORD JOINER
		\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
		\xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
		\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
		\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
		\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
		\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
		\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
		[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
		/xuD";

	$host = preg_replace( $strip, '', $host );

	// IPv6 host names are bracketed with []. The brackets were
	// percent-encoded by the escaping step above, so url-decode them.
	$ipv6 = [];
	if ( substr_compare( "//%5B", $host, 0, 5 ) === 0 &&
		preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $ipv6 )
	) {
		$host = '//[' . $ipv6[1] . ']' . $ipv6[2];
	}

	// @todo FIXME: Validate hostnames here

	return $protocol . $host . $rest;
}
2053
/**
 * Replace callback used by cleanUrl(): url-encodes the whole match.
 *
 * @param array $matches Match array; only index 0 is read
 * @return string
 */
static function cleanUrlCallback( $matches ) {
	$wholeMatch = $matches[0];

	return urlencode( $wholeMatch );
}
2061
/**
 * Does a string look like an e-mail address?
 *
 * The 'isValidEmailAddr' hook runs first with $addr and a by-reference
 * $result; if the hook run returns false, $result is returned as the
 * verdict. Otherwise the address is matched against a liberal pattern:
 * one or more atext characters or dots, an '@', then one or more
 * dot-separated labels made of letters, digits and hyphens.
 *
 * @param string $addr E-mail address
 * @return bool|null Whether the address matches; or whatever the hook
 *   left in $result (null if it left it untouched) when the hook aborts
 */
public static function validateEmail( $addr ) {
	$result = null;
	if ( !Hooks::run( 'isValidEmailAddr', [ $addr, &$result ] ) ) {
		return $result;
	}

	// Please note strings below are enclosed in brackets [], this makes the
	// hyphen "-" a range indicator. Hence it is double backslashed below.
	// See T28948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	// The D modifier (PCRE_DOLLAR_ENDONLY) makes "$" match only at the very
	// end of the subject. Without it "$" also matches just before a final
	// newline, so "user@example.org\n" would be accepted as valid.
	$html5_email_regexp = "/
		^                      # start of string
		[$rfc5322_atext\\.]+    # user part which is liberal :p
		@                      # at sign
		[$rfc1034_ldh_str]+       # First domain part
		(\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
		$                      # End of string
		/ixD"; // case Insensitive, eXtended, Dollar end only

	return (bool)preg_match( $html5_email_regexp, $addr );
}
2113}
$wgExperimentalHtmlIds
Abandoned experiment with HTML5-style ID escaping.
$wgAllowImageTag
A different approach to the above: simply allow the "<img>" tag to be used.
$wgFragmentMode
How should section IDs be encoded? This array can contain 1 or 2 elements, each of them can be one of...
$wgExternalInterwikiFragmentMode
Which ID escaping mode should be used for external interwiki links? See documentation for $wgFragment...
wfUrlProtocols( $includeProtocolRelative=true)
Returns a regular expression of url protocols.
if( $line===false) $args
Definition cdb.php:63
MediaWiki exception.
static isEnabled()
Definition MWTidy.php:79
HTML sanitizer for MediaWiki.
Definition Sanitizer.php:31
static fixTagAttributes( $text, $element, $sorted=false)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML,...
const CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferenc...
Definition Sanitizer.php:36
static stripAllTags( $text)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed,...
static cleanUrl( $url)
static validateAttributes( $attribs, $whitelist)
Take an array of attribute names and values and normalize or discard illegal values for the given whi...
static decCharReference( $codepoint)
static isReservedDataAttribute( $attr)
Given an attribute name, checks whether it is a reserved data attribute (such as data-mw-foo) which i...
static decodeChar( $codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEM...
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
static checkCss( $value)
Pick apart some CSS and check it for forbidden or unsafe structures.
static decodeEntity( $name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that chara...
static normalizeEntity( $name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the equivalent numeric entity re...
static getRecognizedTagData( $extratags=[], $removetags=[])
Return the various lists of recognized tags.
static $attribsRegex
Lazy-initialised attributes regex, see getAttribsRegex()
static escapeIdForLink( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
static removeHTMLcomments( $text)
Remove '<!--', '-->', and everything between.
static encodeAttribute( $text)
Encode an attribute value for HTML output.
static hexCharReference( $codepoint)
static validateTagAttributes( $attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
static escapeClass( $class)
Given a value, escape it so that it can be used as a CSS class and return it.
const EVIL_URI_PATTERN
Blacklist for evil uris like javascript: WARNING: DO NOT use this in any place that actually requires...
Definition Sanitizer.php:56
static normalizeSectionNameWhitespace( $section)
Normalizes whitespace in a section name, such as might be returned by Parser::stripSectionName(),...
static setupAttributeWhitelist()
Foreach array key (an allowed HTML element), return an array of allowed attributes.
static normalizeCharReferencesCallback( $matches)
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
static cleanUrlCallback( $matches)
static normalizeCharReferences( $text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
static escapeIdReferenceList( $referenceString, $options=[])
Given a string containing a space delimited list of ids, escape each id to match ids escaped by the e...
static cssDecodeCallback( $matches)
static normalizeWhitespace( $text)
static removeHTMLtags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[], $warnCallback=null)
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments.
static getAttribsRegex()
Regular expression to match HTML/XML attribute pairs within a tag.
static validateCodepoint( $codepoint)
Returns true if a given Unicode codepoint is a valid character in both HTML5 and XML.
const ID_FALLBACK
Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false if no fallback...
Definition Sanitizer.php:72
static $htmlEntities
List of all named character entities defined in HTML 4.01 https://www.w3.org/TR/html4/sgml/entities....
Definition Sanitizer.php:79
static decodeCharReferences( $text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
static escapeIdForAttribute( $id, $mode=self::ID_PRIMARY)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid HTM...
static getTagAttributeCallback( $set)
Pick the appropriate attribute value from a match set from the attribs regex matches.
static escapeIdInternal( $id, $mode)
Helper for escapeIdFor*() functions.
static validateTag( $params, $element)
Takes attribute names and values for a tag and the tag name and validates that the tag is allowed to ...
static decodeCharReferencesAndNormalize( $text)
Decode any character references, numeric or named entities, in the text and normalize the resulting s...
static $htmlEntityAliases
Character entity aliases accepted by MediaWiki.
static escapeId( $id, $options=[])
Given a value, escape it so that it can be used in an id attribute and return it.
static attributeWhitelist( $element)
Fetch the whitelist of acceptable attributes for a given element name.
static decodeCharReferencesCallback( $matches)
static mergeAttributes( $a, $b)
Merge two sets of HTML attributes.
static escapeIdForExternalInterwiki( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
static validateEmail( $addr)
Does a string look like an e-mail address?
const ID_PRIMARY
Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding.
Definition Sanitizer.php:64
const ELEMENT_BITS_REGEX
Acceptable tag name charset from HTML5 parsing spec https://www.w3.org/TR/html5/syntax....
Definition Sanitizer.php:46
static safeEncodeAttribute( $text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing.
const XMLNS_ATTRIBUTE_PATTERN
Definition Sanitizer.php:57
static normalizeCss( $value)
Normalize CSS into a format we can easily search for hostile input.
static delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags='')
Perform an operation equivalent to preg_replace() with flags.
Unicode normalization routines for working with UTF-8 strings.
Definition UtfNormal.php:48
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global list
Definition deferred.txt:11
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global then executing the whole list after the page is displayed We don t do anything smart like collating updates to the same table or such because the list is almost always going to have just one item on if so it s not worth the trouble Since there is a job queue in the jobs table
Definition deferred.txt:16
this class mediates it Skin Encapsulates a look and feel for the wiki All of the functions that render HTML and make choices about how to render it are here and are called from various other places when and is meant to be subclassed with other skins that may override some of its functions The User object contains a reference to a and so rather than having a global skin object we just rely on the global User and get the skin with $wgUser and also has some character encoding functions and other locale stuff The current user interface language is instantiated as and the local content language as $wgContLang
Definition design.txt:57
the array() calling protocol came about after MediaWiki 1.4rc1.
static configuration should be added through ResourceLoaderGetConfigVars instead & $vars
Definition hooks.txt:2198
namespace being checked & $result
Definition hooks.txt:2293
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped & $options
Definition hooks.txt:1971
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses & $ret
Definition hooks.txt:1975
this hook is for auditing only or null if authentication failed before getting that far or null if we can t even determine that probably a stub it is not rendered in wiki pages or galleries in category pages allow injecting custom HTML after the section Any uses of the hook need to handle escaping see BaseTemplate::getToolbox and BaseTemplate::makeListItem for details on the format of individual items inside of this array or by returning and letting standard HTTP rendering take place modifiable or by returning false and taking over the output $out
Definition hooks.txt:862
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses just before the function returns a value If you return an< a > element with HTML attributes $attribs and contents $html will be returned If you return $ret will be returned and may include noclasses & $html
Definition hooks.txt:1983
Allows to change the fields on the form that will be generated $name
Definition hooks.txt:302
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses just before the function returns a value If you return an< a > element with HTML attributes $attribs and contents $html will be returned If you return $ret will be returned and may include noclasses after processing & $attribs
Definition hooks.txt:1984
usually copyright or history_copyright This message must be in HTML not wikitext if the section is included from a template $section
Definition hooks.txt:2990
$params