MediaWiki  master
Sanitizer.php
Go to the documentation of this file.
1 <?php
28 
33 class Sanitizer {
/**
 * Regular expression matching the different kinds of character reference
 * found in HTML text. Capture groups, in order:
 *  1. a named entity (&name;)
 *  2. a decimal numeric reference (&#123;)
 *  3. a hexadecimal numeric reference (&#x1F; or &#X1F;)
 *  4. a bare ampersand that starts none of the above
 * The /x flag makes the embedded whitespace and newlines insignificant.
 */
const CHAR_REFS_REGEX =
	'/&([A-Za-z0-9\x80-\xff]+);
	 |&\#([0-9]+);
	 |&\#[xX]([0-9A-Fa-f]+);
	 |(&)/x';

/**
 * Splits one "bit" of text following a '<' into its parts. Capture groups:
 *  1. optional leading slash (closing tag)
 *  2. element name
 *  3. everything between the element name and the closing bracket
 *  4. the closing '>' or '/>'
 *  5. the remaining text up to the next '<'
 */
const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';

/**
 * Case-insensitive pattern flagging values that contain a "javascript" or
 * "vbscript" token at the start, after whitespace, or after a closing
 * comment marker, and followed by a non-word character or the end of string.
 */
const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';

/**
 * Pattern recognising XML namespace declaration attribute names (xmlns:prefix).
 */
const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";

/**
 * Index into $wgFragmentMode selecting the primary ID escaping mode.
 */
const ID_PRIMARY = 0;

/**
 * Index into $wgFragmentMode selecting the fallback ID escaping mode.
 */
const ID_FALLBACK = 1;
75 
/**
 * Table mapping named character entities (without the leading '&' and
 * trailing ';') to their integer code points. Keys are case-sensitive:
 * e.g. 'Aacute' and 'aacute' are distinct entries.
 */
81  private const HTML_ENTITIES = [
82  'Aacute' => 193,
83  'aacute' => 225,
84  'Acirc' => 194,
85  'acirc' => 226,
86  'acute' => 180,
87  'AElig' => 198,
88  'aelig' => 230,
89  'Agrave' => 192,
90  'agrave' => 224,
91  'alefsym' => 8501,
92  'Alpha' => 913,
93  'alpha' => 945,
94  'amp' => 38,
95  'and' => 8743,
96  'ang' => 8736,
97  'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
98  'Aring' => 197,
99  'aring' => 229,
100  'asymp' => 8776,
101  'Atilde' => 195,
102  'atilde' => 227,
103  'Auml' => 196,
104  'auml' => 228,
105  'bdquo' => 8222,
106  'Beta' => 914,
107  'beta' => 946,
108  'brvbar' => 166,
109  'bull' => 8226,
110  'cap' => 8745,
111  'Ccedil' => 199,
112  'ccedil' => 231,
113  'cedil' => 184,
114  'cent' => 162,
115  'Chi' => 935,
116  'chi' => 967,
117  'circ' => 710,
118  'clubs' => 9827,
119  'cong' => 8773,
120  'copy' => 169,
121  'crarr' => 8629,
122  'cup' => 8746,
123  'curren' => 164,
124  'dagger' => 8224,
125  'Dagger' => 8225,
126  'darr' => 8595,
127  'dArr' => 8659,
128  'deg' => 176,
129  'Delta' => 916,
130  'delta' => 948,
131  'diams' => 9830,
132  'divide' => 247,
133  'Eacute' => 201,
134  'eacute' => 233,
135  'Ecirc' => 202,
136  'ecirc' => 234,
137  'Egrave' => 200,
138  'egrave' => 232,
139  'empty' => 8709,
140  'emsp' => 8195,
141  'ensp' => 8194,
142  'Epsilon' => 917,
143  'epsilon' => 949,
144  'equiv' => 8801,
145  'Eta' => 919,
146  'eta' => 951,
147  'ETH' => 208,
148  'eth' => 240,
149  'Euml' => 203,
150  'euml' => 235,
151  'euro' => 8364,
152  'exist' => 8707,
153  'fnof' => 402,
154  'forall' => 8704,
155  'frac12' => 189,
156  'frac14' => 188,
157  'frac34' => 190,
158  'frasl' => 8260,
159  'Gamma' => 915,
160  'gamma' => 947,
161  'ge' => 8805,
162  'gt' => 62,
163  'harr' => 8596,
164  'hArr' => 8660,
165  'hearts' => 9829,
166  'hellip' => 8230,
167  'Iacute' => 205,
168  'iacute' => 237,
169  'Icirc' => 206,
170  'icirc' => 238,
171  'iexcl' => 161,
172  'Igrave' => 204,
173  'igrave' => 236,
174  'image' => 8465,
175  'infin' => 8734,
176  'int' => 8747,
177  'Iota' => 921,
178  'iota' => 953,
179  'iquest' => 191,
180  'isin' => 8712,
181  'Iuml' => 207,
182  'iuml' => 239,
183  'Kappa' => 922,
184  'kappa' => 954,
185  'Lambda' => 923,
186  'lambda' => 955,
187  'lang' => 9001,
188  'laquo' => 171,
189  'larr' => 8592,
190  'lArr' => 8656,
191  'lceil' => 8968,
192  'ldquo' => 8220,
193  'le' => 8804,
194  'lfloor' => 8970,
195  'lowast' => 8727,
196  'loz' => 9674,
197  'lrm' => 8206,
198  'lsaquo' => 8249,
199  'lsquo' => 8216,
200  'lt' => 60,
201  'macr' => 175,
202  'mdash' => 8212,
203  'micro' => 181,
204  'middot' => 183,
205  'minus' => 8722,
206  'Mu' => 924,
207  'mu' => 956,
208  'nabla' => 8711,
209  'nbsp' => 160,
210  'ndash' => 8211,
211  'ne' => 8800,
212  'ni' => 8715,
213  'not' => 172,
214  'notin' => 8713,
215  'nsub' => 8836,
216  'Ntilde' => 209,
217  'ntilde' => 241,
218  'Nu' => 925,
219  'nu' => 957,
220  'Oacute' => 211,
221  'oacute' => 243,
222  'Ocirc' => 212,
223  'ocirc' => 244,
224  'OElig' => 338,
225  'oelig' => 339,
226  'Ograve' => 210,
227  'ograve' => 242,
228  'oline' => 8254,
229  'Omega' => 937,
230  'omega' => 969,
231  'Omicron' => 927,
232  'omicron' => 959,
233  'oplus' => 8853,
234  'or' => 8744,
235  'ordf' => 170,
236  'ordm' => 186,
237  'Oslash' => 216,
238  'oslash' => 248,
239  'Otilde' => 213,
240  'otilde' => 245,
241  'otimes' => 8855,
242  'Ouml' => 214,
243  'ouml' => 246,
244  'para' => 182,
245  'part' => 8706,
246  'permil' => 8240,
247  'perp' => 8869,
248  'Phi' => 934,
249  'phi' => 966,
250  'Pi' => 928,
251  'pi' => 960,
252  'piv' => 982,
253  'plusmn' => 177,
254  'pound' => 163,
255  'prime' => 8242,
256  'Prime' => 8243,
257  'prod' => 8719,
258  'prop' => 8733,
259  'Psi' => 936,
260  'psi' => 968,
261  'quot' => 34,
262  'radic' => 8730,
263  'rang' => 9002,
264  'raquo' => 187,
265  'rarr' => 8594,
266  'rArr' => 8658,
267  'rceil' => 8969,
268  'rdquo' => 8221,
269  'real' => 8476,
270  'reg' => 174,
271  'rfloor' => 8971,
272  'Rho' => 929,
273  'rho' => 961,
274  'rlm' => 8207,
275  'rsaquo' => 8250,
276  'rsquo' => 8217,
277  'sbquo' => 8218,
278  'Scaron' => 352,
279  'scaron' => 353,
280  'sdot' => 8901,
281  'sect' => 167,
282  'shy' => 173,
283  'Sigma' => 931,
284  'sigma' => 963,
285  'sigmaf' => 962,
286  'sim' => 8764,
287  'spades' => 9824,
288  'sub' => 8834,
289  'sube' => 8838,
290  'sum' => 8721,
291  'sup' => 8835,
292  'sup1' => 185,
293  'sup2' => 178,
294  'sup3' => 179,
295  'supe' => 8839,
296  'szlig' => 223,
297  'Tau' => 932,
298  'tau' => 964,
299  'there4' => 8756,
300  'Theta' => 920,
301  'theta' => 952,
302  'thetasym' => 977,
303  'thinsp' => 8201,
304  'THORN' => 222,
305  'thorn' => 254,
306  'tilde' => 732,
307  'times' => 215,
308  'trade' => 8482,
309  'Uacute' => 218,
310  'uacute' => 250,
311  'uarr' => 8593,
312  'uArr' => 8657,
313  'Ucirc' => 219,
314  'ucirc' => 251,
315  'Ugrave' => 217,
316  'ugrave' => 249,
317  'uml' => 168,
318  'upsih' => 978,
319  'Upsilon' => 933,
320  'upsilon' => 965,
321  'Uuml' => 220,
322  'uuml' => 252,
323  'weierp' => 8472,
324  'Xi' => 926,
325  'xi' => 958,
326  'Yacute' => 221,
327  'yacute' => 253,
328  'yen' => 165,
329  'Yuml' => 376,
330  'yuml' => 255,
331  'Zeta' => 918,
332  'zeta' => 950,
333  'zwj' => 8205,
334  'zwnj' => 8204
335  ];
336 
/**
 * Alternative (non-Latin) spellings of entity names, mapped to the
 * canonical key used in HTML_ENTITIES. Both entries here resolve to 'rlm'.
 * NOTE(review): the code consuming this table is outside this excerpt;
 * presumably entity normalization/decoding — confirm.
 */
340  private const HTML_ENTITY_ALIASES = [
341  'רלמ' => 'rlm',
342  'رلم' => 'rlm',
343  ];
344 
/**
 * Cache for the regular expression assembled by getAttribsRegex();
 * stays null until that method is first called.
 */
348  private static $attribsRegex;
349 
/**
 * Return the regular expression used to pick attribute name/value pairs
 * out of the text between an element name and its closing bracket.
 *
 * The expression is assembled once and cached in self::$attribsRegex.
 * It captures the attribute name in group 1 and, when a value is present,
 * the value in one of the later groups depending on how it was quoted
 * (double-quoted, single-quoted or unquoted).
 *
 * @return string PCRE pattern including delimiters and the "sxu" flags
 */
356  static function getAttribsRegex() {
357  if ( self::$attribsRegex === null ) {
# Tab, LF, FF, CR and space: the characters treated as whitespace here
358  $spaceChars = '\x09\x0a\x0c\x0d\x20';
359  $space = "[{$spaceChars}]";
# A name character is anything except whitespace, '/', '>' and '='
360  $attrib = "[^{$spaceChars}\/>=]";
# ...but the very first character of a name is additionally allowed to be '='
361  $attribFirst = "(?:{$attrib}|=)";
362  self::$attribsRegex =
363  "/({$attribFirst}{$attrib}*)
364  ($space*=$space*
365  (?:
366  # The attribute value: quoted or alone
367  \"([^\"]*)(?:\"|\$)
368  | '([^']*)(?:'|\$)
369  | (((?!$space|>).)*)
370  )
371  )?/sxu";
372  }
373  return self::$attribsRegex;
374  }
375 
/**
 * Cache for the pattern produced by getAttribNameRegex(); null until built.
 */
private static $attribNameRegex;

/**
 * Return the regular expression that a (lower-cased) attribute name must
 * match in full to be accepted.
 *
 * The first character may be a colon, underscore, Unicode letter or
 * Unicode number; following characters may additionally be '.' or '-'.
 * The pattern is built on first use and then served from the cache.
 *
 * @return string PCRE pattern including delimiters and the "sxu" flags
 */
static function getAttribNameRegex() {
	if ( self::$attribNameRegex !== null ) {
		// Already assembled on an earlier call
		return self::$attribNameRegex;
	}

	$leadingChar = "[:_\p{L}\p{N}]";
	$followingChar = "[:_\.\-\p{L}\p{N}]";
	self::$attribNameRegex = "/^({$leadingChar}{$followingChar}*)$/sxu";

	return self::$attribNameRegex;
}
393 
/**
 * Return the lookup tables describing which HTML elements are recognised
 * and how they may be used.
 *
 * Every returned table is keyed by tag name (values are irrelevant indexes),
 * so membership should be tested with isset().
 *
 * The static tables are cached between calls and rebuilt only when the
 * value of $wgAllowImageTag changes. A dedicated boolean flag records
 * whether the cache has been built, so the cache also works while
 * $wgAllowImageTag is falsy.
 *
 * @param array $extratags Additional tag names to treat as recognised paired tags
 * @param array $removetags Tag names to drop from the recognised element set
 * @return array Map with the keys 'htmlpairs', 'htmlsingle', 'htmlsingleonly',
 *   'htmlnest', 'tabletags', 'htmllist', 'listtags', 'htmlsingleallowed'
 *   and 'htmlelements'
 */
public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
	global $wgAllowImageTag;

	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic,
		$staticInitialised = false, $cachedContext = null;

	// Key the cache on the global config state so that if the global is
	// changed (e.g. by the test system) the tables are re-initialised.
	$globalContext = (bool)$wgAllowImageTag;
	if ( !$staticInitialised || $cachedContext !== $globalContext ) {
		# Tags that must be closed
		$pairs = [
			'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
			'kbd', 'samp', 'data', 'time', 'mark'
		];
		$single = [
			'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
		];

		# Elements that cannot have close tags. This is (not coincidentally)
		# also the list of tags for which the HTML 5 parsing algorithm
		# requires you to "acknowledge the token's self-closing flag", i.e.
		# a self-closing tag like <br/> is not an HTML 5 parse error only
		# for this list.
		$singleOnly = [
			'br', 'wbr', 'hr', 'meta', 'link'
		];

		# Tags that can be nested
		$nest = [
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
			'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
		];
		# Can only appear inside table, we will close them
		$table = [
			'td', 'th', 'tr',
		];
		# Tags used by list
		$list = [
			'ul', 'ol',
		];
		# Tags that can appear in a list
		$listItems = [
			'li',
		];

		if ( $globalContext ) {
			$single[] = 'img';
			$singleOnly[] = 'img';
		}

		$singleAllowed = array_unique( array_merge( $single, $table ) );
		$elements = array_unique( array_merge( $single, $pairs, $nest ) );

		# Convert them all to hashtables for faster lookup
		$htmlpairsStatic = array_flip( $pairs );
		$htmlsingle = array_flip( $single );
		$htmlsingleonly = array_flip( $singleOnly );
		$htmlnest = array_flip( $nest );
		$tabletags = array_flip( $table );
		$htmllist = array_flip( $list );
		$listtags = array_flip( $listItems );
		$htmlsingleallowed = array_flip( $singleAllowed );
		$htmlelementsStatic = array_flip( $elements );

		$staticInitialised = true;
		$cachedContext = $globalContext;
	}

	# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
	$extratags = array_flip( $extratags );
	$removetags = array_flip( $removetags );
	$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
	$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );

	return [
		'htmlpairs' => $htmlpairs,
		'htmlsingle' => $htmlsingle,
		'htmlsingleonly' => $htmlsingleonly,
		'htmlnest' => $htmlnest,
		'tabletags' => $tabletags,
		'htmllist' => $htmllist,
		'listtags' => $listtags,
		'htmlsingleallowed' => $htmlsingleallowed,
		'htmlelements' => $htmlelements,
	];
}
481 
/**
 * Clean up HTML in wikitext-like input: comments are removed, recognised
 * tags are kept with their attributes filtered through fixTagAttributes(),
 * and everything else that looks like a tag is escaped ('<' becomes &lt;,
 * '>' becomes &gt;).
 *
 * Two strategies exist. When MWTidy is disabled (a deprecated setup) the
 * method balances tags itself using a tag stack; otherwise it only filters
 * tags and leaves balancing to later processing.
 *
 * @param string $text Input text
 * @param callable|null $processCallback If callable, invoked with
 *   [ &$params, $args ] for each recognised opening tag so the attribute
 *   text can be rewritten before validation
 * @param array|bool $args Passed through to $processCallback
 * @param array $extratags Extra tag names to accept (see getRecognizedTagData())
 * @param array $removetags Tag names to reject (see getRecognizedTagData())
 * @param callable|null $warnCallback If callable, invoked with
 *   'deprecated-self-close-category' when a non-void tag is self-closed
 * @return string Cleaned text
 */
497  public static function removeHTMLtags( $text, $processCallback = null,
498  $args = [], $extratags = [], $removetags = [], $warnCallback = null
499  ) {
500  $tagData = self::getRecognizedTagData( $extratags, $removetags );
501  $htmlpairs = $tagData['htmlpairs'];
502  $htmlsingle = $tagData['htmlsingle'];
503  $htmlsingleonly = $tagData['htmlsingleonly'];
504  $htmlnest = $tagData['htmlnest'];
505  $tabletags = $tagData['tabletags'];
506  $htmllist = $tagData['htmllist'];
507  $listtags = $tagData['listtags'];
508  $htmlsingleallowed = $tagData['htmlsingleallowed'];
509  $htmlelements = $tagData['htmlelements'];
510 
511  # Remove HTML comments
512  $text = self::removeHTMLcomments( $text );
# Split on '<': the first bit is plain text, every later bit starts
# right after a '<' and is a candidate tag followed by trailing text.
513  $bits = explode( '<', $text );
514  $text = str_replace( '>', '&gt;', array_shift( $bits ) );
515  if ( !MWTidy::isEnabled() ) {
# Deprecated path: no tidy, so tag balancing is done here by hand.
516  wfDeprecated( 'disabling tidy', '1.33' );
# $tagstack: currently open elements; $tablestack: saved tag stacks of
# enclosing tables (a new stack is started for each <table>).
517  $tagstack = $tablestack = [];
518  foreach ( $bits as $x ) {
519  $regs = [];
520  # $slash: Does the current element start with a '/'?
521  # $t: Current element name
522  # $params: String between element name and >
523  # $brace: Ending '>' or '/>'
524  # $rest: Everything until the next element of $bits
525  if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
526  list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
527  } else {
528  $slash = $t = $params = $brace = $rest = null;
529  }
530 
531  $badtag = false;
# NOTE(review): when the regex above did not match, $t is null here and is
# passed to strtolower() — confirm this is acceptable on supported PHP versions.
532  $t = strtolower( $t );
533  if ( isset( $htmlelements[$t] ) ) {
534  # Check our stack
535  if ( $slash && isset( $htmlsingleonly[$t] ) ) {
# A closing tag for an element that can never be closed (e.g. </br>)
536  $badtag = true;
537  } elseif ( $slash ) {
538  # Closing a tag... is it the one we just opened?
539  Wikimedia\suppressWarnings();
540  $ot = array_pop( $tagstack );
541  Wikimedia\restoreWarnings();
542 
543  if ( $ot != $t ) {
544  if ( isset( $htmlsingleallowed[$ot] ) ) {
545  # Pop all elements with an optional close tag
546  # and see if we find a match below them
547  $optstack = [];
548  array_push( $optstack, $ot );
549  Wikimedia\suppressWarnings();
550  $ot = array_pop( $tagstack );
551  Wikimedia\restoreWarnings();
552  while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
553  array_push( $optstack, $ot );
554  Wikimedia\suppressWarnings();
555  $ot = array_pop( $tagstack );
556  Wikimedia\restoreWarnings();
557  }
558  if ( $t != $ot ) {
559  # No match. Push the optional elements back again
560  $badtag = true;
561  Wikimedia\suppressWarnings();
562  $ot = array_pop( $optstack );
563  Wikimedia\restoreWarnings();
564  while ( $ot ) {
565  array_push( $tagstack, $ot );
566  Wikimedia\suppressWarnings();
567  $ot = array_pop( $optstack );
568  Wikimedia\restoreWarnings();
569  }
570  }
571  } else {
# The popped element must stay open: put it back on the stack
572  Wikimedia\suppressWarnings();
573  array_push( $tagstack, $ot );
574  Wikimedia\restoreWarnings();
575 
576  # <li> can be nested in <ul> or <ol>, skip those cases:
577  if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
578  $badtag = true;
579  }
580  }
581  } elseif ( $t == 'table' ) {
# Properly closed table: resume the tag stack saved when it was opened
582  $tagstack = array_pop( $tablestack );
583  }
584  $newparams = '';
585  } else {
586  # Keep track for later
587  if ( isset( $tabletags[$t] ) && !in_array( 'table', $tagstack ) ) {
588  $badtag = true;
589  } elseif ( in_array( $t, $tagstack ) && !isset( $htmlnest[$t] ) ) {
590  $badtag = true;
591  #  Is it a self closed htmlpair ? (T7487)
592  } elseif ( $brace == '/>' && isset( $htmlpairs[$t] ) ) {
593  // Eventually we'll just remove the self-closing
594  // slash, in order to be consistent with HTML5
595  // semantics.
596  // $brace = '>';
597  // For now, let's just warn authors to clean up.
598  if ( is_callable( $warnCallback ) ) {
599  call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] );
600  }
601  $badtag = true;
602  } elseif ( isset( $htmlsingleonly[$t] ) ) {
603  # Hack to force empty tag for unclosable elements
604  $brace = '/>';
605  } elseif ( isset( $htmlsingle[$t] ) ) {
606  # Hack to not close $htmlsingle tags
607  $brace = null;
608  # Still need to push this optionally-closed tag to
609  # the tag stack so that we can match end tags
610  # instead of marking them as bad.
611  array_push( $tagstack, $t );
612  } elseif ( isset( $tabletags[$t] ) && in_array( $t, $tagstack ) ) {
613  // New table tag but forgot to close the previous one
614  $text .= "</$t>";
615  } else {
616  if ( $t == 'table' ) {
# Entering a table: save the outer stack and start a fresh one
617  array_push( $tablestack, $tagstack );
618  $tagstack = [];
619  }
620  array_push( $tagstack, $t );
621  }
622 
623  # Replace any variables or template parameters with
624  # plaintext results.
625  if ( is_callable( $processCallback ) ) {
626  call_user_func_array( $processCallback, [ &$params, $args ] );
627  }
628 
629  if ( !self::validateTag( $params, $t ) ) {
630  $badtag = true;
631  }
632 
633  # Strip non-approved attributes from the tag
634  $newparams = self::fixTagAttributes( $params, $t );
635  }
636  if ( !$badtag ) {
# Accepted tag: emit it, escaping any stray '>' in the trailing text
637  $rest = str_replace( '>', '&gt;', $rest );
638  $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
639  $text .= "<$slash$t$newparams$close>$rest";
640  continue;
641  }
642  }
# Unrecognised or rejected tag: output it as escaped literal text
643  $text .= '&lt;' . str_replace( '>', '&gt;', $x );
644  }
645  # Close off any remaining tags
646  while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
647  $text .= "</$t>\n";
648  if ( $t == 'table' ) {
649  $tagstack = array_pop( $tablestack );
650  }
651  }
652  } else {
653  # this might be possible using tidy itself
654  foreach ( $bits as $x ) {
655  if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
656  list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
657 
658  $badtag = false;
659  $t = strtolower( $t );
660  if ( isset( $htmlelements[$t] ) ) {
661  if ( is_callable( $processCallback ) ) {
662  call_user_func_array( $processCallback, [ &$params, $args ] );
663  }
664 
665  if ( $brace == '/>' && !( isset( $htmlsingle[$t] ) || isset( $htmlsingleonly[$t] ) ) ) {
666  // Eventually we'll just remove the self-closing
667  // slash, in order to be consistent with HTML5
668  // semantics.
669  // $brace = '>';
670  // For now, let's just warn authors to clean up.
671  if ( is_callable( $warnCallback ) ) {
672  call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] );
673  }
674  }
675  if ( !self::validateTag( $params, $t ) ) {
676  $badtag = true;
677  }
678 
679  $newparams = self::fixTagAttributes( $params, $t );
680  if ( !$badtag ) {
681  if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) {
682  # Interpret self-closing tags as empty tags even when
683  # HTML 5 would interpret them as start tags. Such input
684  # is commonly seen on Wikimedia wikis with this intention.
685  $brace = "></$t>";
686  }
687 
688  $rest = str_replace( '>', '&gt;', $rest );
689  $text .= "<$slash$t$newparams$brace$rest";
690  continue;
691  }
692  }
693  }
# Not a recognised/valid tag: output it as escaped literal text
694  $text .= '&lt;' . str_replace( '>', '&gt;', $x );
695  }
696  }
697  return $text;
698  }
699 
/**
 * Strip HTML comments (<!-- ... -->) from the text.
 *
 * When a comment sits on a line of its own (only spaces between it and the
 * surrounding newlines), the whole line is collapsed so that a single
 * newline remains. Otherwise only the comment itself is cut out.
 * Processing stops at the first comment that has no terminator.
 *
 * @param string $text
 * @return string
 */
public static function removeHTMLcomments( $text ) {
	while ( true ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}

		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; leave the remainder untouched
			break;
		}
		# Move past the terminator itself
		$close += 3;

		# Widen the span over any spaces directly around the comment
		$from = max( $open - 1, 0 );
		$span = $close - $from;
		while ( substr( $text, $from, 1 ) === ' ' && $from > 0 ) {
			$from--;
			$span++;
		}
		while ( substr( $text, $from + $span, 1 ) === ' ' ) {
			$span++;
		}

		$ownLine = substr( $text, $from, 1 ) === "\n"
			&& substr( $text, $from + $span, 1 ) === "\n";

		# Either drop the comment with its surrounding spaces and one of
		# the two newlines, or drop only the comment.
		$text = $ownLine
			? substr_replace( $text, "\n", $from, $span + 1 )
			: substr_replace( $text, '', $open, $close - $open );
	}

	return $text;
}
742 
/**
 * Check that a tag carries the attributes it needs to be acceptable.
 *
 * Only <meta> and <link> have requirements: both need an itemprop
 * attribute; <meta> additionally needs content and <link> needs href.
 * Every other element passes.
 *
 * @param string $params Raw attribute text of the tag
 * @param string $element Tag name
 * @return bool True when the tag is acceptable
 */
static function validateTag( $params, $element ) {
	$attribs = self::decodeTagAttributes( $params );

	$isMeta = $element == 'meta';
	$isLink = $element == 'link';
	if ( !$isMeta && !$isLink ) {
		return true;
	}

	// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
	if ( !isset( $attribs['itemprop'] ) ) {
		return false;
	}
	// <meta> must have a content="" for the itemprop
	if ( $isMeta && !isset( $attribs['content'] ) ) {
		return false;
	}
	// <link> must have an associated href=""
	if ( $isLink && !isset( $attribs['href'] ) ) {
		return false;
	}

	return true;
}
775 
/**
 * Filter decoded attributes for a given element, using the attribute
 * whitelist registered for that element.
 *
 * @param array $attribs Decoded attributes (name => value)
 * @param string $element Tag name
 * @return array Filtered attributes, as returned by validateAttributes()
 */
static function validateTagAttributes( $attribs, $element ) {
	$allowed = self::attributeWhitelistInternal( $element );

	return self::validateAttributes( $attribs, $allowed );
}
795 
/**
 * Filter a set of decoded attributes against a whitelist and sanitize the
 * values of the ones that are kept.
 *
 * - xmlns:* declarations are kept unless their value looks like a script URI.
 * - data-* attributes without a colon are always allowed, except the
 *   reserved ones (see isReservedDataAttribute()).
 * - style is passed through checkCss(), id through escapeIdForAttribute(),
 *   and ARIA id-reference lists through escapeIdReferenceList().
 * - RDFa / microdata attributes are dropped when they look like a script URI.
 * - href, src and poster are dropped unless they start with an allowed protocol.
 * - itemtype, itemid and itemref are dropped when itemscope is absent.
 *
 * @param array $attribs Decoded attributes (name => value)
 * @param array $whitelist Allowed attribute names, either as keys or as a
 *   sequential list of names
 * @return array Filtered attributes (name => value)
 */
static function validateAttributes( $attribs, $whitelist ) {
	if ( isset( $whitelist[0] ) ) {
		// Sequential list of names: convert to a name-keyed map.
		// Calling with a sequential array may eventually be deprecated.
		$whitelist = array_flip( $whitelist );
	}
	$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	# Attributes whose value is a space-separated list of element ids
	$idListAttribs = [ 'aria-describedby', 'aria-flowto', 'aria-labelledby', 'aria-owns' ];
	# RDFa and microdata attributes that may hold URLs, URIs and/or CURIs
	$uriLikeAttribs = [
		'rel', 'rev',
		# RDFa
		'about', 'property', 'resource', 'datatype', 'typeof',
		# HTML5 microdata
		'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',
	];
	# Attributes that must start with an allowed protocol
	$urlAttribs = [ 'href', 'src', 'poster' ];

	$out = [];
	foreach ( $attribs as $name => $val ) {
		# Allow XML namespace declaration to allow RDFa
		if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $name ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $val ) ) {
				$out[$name] = $val;
			}
			continue;
		}

		# Allow any attribute beginning with "data-", provided it is not
		# namespaced (no colons); otherwise the name must be whitelisted.
		$permitted = preg_match( '/^data-[^:]*$/i', $name )
			|| array_key_exists( $name, $whitelist );
		# Data attributes used by MediaWiki code are never accepted.
		if ( !$permitted || self::isReservedDataAttribute( $name ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# https://msdn.microsoft.com/en-us/library/ms537634.aspx
		if ( $name == 'style' ) {
			$val = self::checkCss( $val );
		}

		# Escape HTML id attributes
		if ( $name === 'id' ) {
			$val = self::escapeIdForAttribute( $val, self::ID_PRIMARY );
		}

		# Escape HTML id reference lists
		if ( in_array( $name, $idListAttribs, true ) ) {
			$val = self::escapeIdReferenceList( $val );
		}

		// Paranoia. Allow "simple" values but suppress javascript
		if ( in_array( $name, $uriLikeAttribs, true )
			&& preg_match( self::EVIL_URI_PATTERN, $val )
		) {
			continue;
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		# validation code that can be used by tag hook handlers, etc
		if ( in_array( $name, $urlAttribs, true ) && !preg_match( $hrefExp, $val ) ) {
			// Drop any href or src attributes not using an allowed protocol.
			// NOTE: this also drops all relative URLs
			continue;
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$out[$name] = $val;
	}

	# itemtype, itemid, itemref don't make sense without itemscope
	if ( !array_key_exists( 'itemscope', $out ) ) {
		unset( $out['itemtype'], $out['itemid'], $out['itemref'] );
	}
	# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

	return $out;
}
907 
/**
 * Tell whether an attribute name belongs to one of the reserved data-*
 * families: names starting (case-insensitively) with data-ooui, data-mw
 * or data-parsoid.
 *
 * data-ooui is reserved for ooui; data-mw and data-parsoid are reserved
 * for parsoid; data-mw-<name> is reserved for extensions (or core) that
 * need to pass data to the client and be sure it did not come from an
 * untrusted user. Namespaced names are not considered, since
 * user-generated HTML can't use them anymore.
 *
 * @param string $attr Attribute name
 * @return bool
 */
public static function isReservedDataAttribute( $attr ) {
	$hits = preg_match( '/^data-(ooui|mw|parsoid)/i', $attr );

	return $hits === 1;
}
925 
/**
 * Merge two attribute arrays; entries of $b win over entries of $a.
 *
 * The 'class' attribute is special-cased: when both inputs provide it as
 * different strings, the class names from both are combined into one
 * space-separated list without duplicates.
 *
 * @param array $a
 * @param array $b
 * @return array
 */
static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	$bothStringClasses = isset( $a['class'], $b['class'] )
		&& is_string( $a['class'] )
		&& is_string( $b['class'] );
	if ( !$bothStringClasses || $a['class'] === $b['class'] ) {
		// Nothing to combine; the plain merge result stands
		return $merged;
	}

	$names = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
		-1, PREG_SPLIT_NO_EMPTY );
	$merged['class'] = implode( ' ', array_unique( $names ) );

	return $merged;
}
948 
/**
 * Normalize a CSS value so that later checks see its real content.
 *
 * Steps, in order: decode HTML character references, decode CSS backslash
 * escapes and line continuations (via cssDecodeCallback()), then — unless
 * the value is nothing but a single comment — replace comments with spaces
 * and cut off anything after a remaining comment-start token.
 *
 * @param string $value CSS text, e.g. the content of a style attribute
 * @return string Normalized CSS text
 */
957  public static function normalizeCss( $value ) {
958  // Decode character references like &#123;
959  $value = self::decodeCharReferences( $value );
960 
961  // Decode escape sequences and line continuation
962  // See the grammar in the CSS 2 spec, appendix D.
963  // This has to be done AFTER decoding character references.
964  // This means it isn't possible for this function to return
965  // unsanitized escape sequences. It is possible to manufacture
966  // input that contains character references that decode to
967  // escape sequences that decode to character references, but
968  // it's OK for the return value to contain character references
969  // because the caller is supposed to escape those anyway.
// The regex is assembled once per request and kept in a static local.
970  static $decodeRegex;
971  if ( !$decodeRegex ) {
972  $space = '[\\x20\\t\\r\\n\\f]';
973  $nl = '(?:\\n|\\r\\n|\\r|\\f)';
974  $backslash = '\\\\';
975  $decodeRegex = "/ $backslash
976  (?:
977  ($nl) | # 1. Line continuation
978  ([0-9A-Fa-f]{1,6})$space? | # 2. character number
979  (.) | # 3. backslash cancelling special meaning
980  () | # 4. backslash at end of string
981  )/xu";
982  }
983  $value = preg_replace_callback( $decodeRegex,
984  [ __CLASS__, 'cssDecodeCallback' ], $value );
985 
986  // Let the value through if it's nothing but a single comment, to
987  // allow other functions which may reject it to pass some error
988  // message through.
989  if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
990  // Remove any comments; IE gets token splitting wrong
991  // This must be done AFTER decoding character references and
992  // escape sequences, because those steps can introduce comments
993  // This step cannot introduce character references or escape
994  // sequences, because it replaces comments with spaces rather
995  // than removing them completely.
996  $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
997 
998  // Remove anything after a comment-start token, to guard against
999  // incorrect client implementations.
1000  $commentPos = strpos( $value, '/*' );
1001  if ( $commentPos !== false ) {
1002  $value = substr( $value, 0, $commentPos );
1003  }
1004  }
1005 
1006  return $value;
1007  }
1008 
/**
 * Normalize a CSS value and reject it if it contains anything considered
 * dangerous.
 *
 * The value is first run through normalizeCss(). It is then replaced by a
 * CSS comment explaining the rejection when it contains control characters
 * or the UTF-8 replacement character, or when it matches one of the
 * blacklisted constructs listed in the pattern below (matched
 * case-insensitively).
 *
 * @param string $value CSS text, e.g. the content of a style attribute
 * @return string The normalized CSS, or a comment such as
 *   "/* insecure input *" followed by "/" when rejected
 */
1027  static function checkCss( $value ) {
1028  $value = self::normalizeCss( $value );
1029 
1030  // Reject problematic keywords and control characters
1031  if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ||
1032  strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false ) {
1033  return '/* invalid control char */';
1034  } elseif ( preg_match(
1035  '! expression
1036  | filter\s*:
1037  | accelerator\s*:
1038  | -o-link\s*:
1039  | -o-link-source\s*:
1040  | -o-replace\s*:
1041  | url\s*\(
1042  | image\s*\(
1043  | image-set\s*\(
1044  | attr\s*\([^)]+[\s,]+url
1045  | var\s*\(
1046  !ix', $value ) ) {
1047  return '/* insecure input */';
1048  }
// Nothing objectionable found: hand back the normalized value
1049  return $value;
1050  }
1051 
/**
 * preg_replace_callback() handler that decodes one CSS backslash escape.
 *
 * Expected capture groups: 1 = line continuation, 2 = hexadecimal
 * character number, 3 = single escaped character; if all are empty the
 * backslash stood at the end of the string.
 *
 * @param array $matches Match data for one escape sequence
 * @return string Replacement text
 */
static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		// Line continuation: contributes nothing to the value
		return '';
	}

	if ( $matches[2] !== '' ) {
		$char = UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$char = $matches[3];
	} else {
		$char = '\\';
	}

	// Newline, quotes and backslash need to stay escaped inside strings;
	// they are re-emitted as a clean numeric escape to avoid parsing
	// errors by clients. Any other escape was unnecessary and is decoded.
	$mustStayEscaped = in_array( $char, [ "\n", '"', "'", '\\' ] );

	return $mustStayEscaped
		? '\\' . dechex( ord( $char ) ) . ' '
		: $char;
}
1076 
/**
 * Turn the raw attribute text of a tag into a sanitized attribute string.
 *
 * The text is decoded into name/value pairs, filtered against the
 * whitelist for the element, optionally sorted by attribute name, and
 * re-encoded. Blank input yields an empty string.
 *
 * @param string $text Raw attribute text
 * @param string $element Tag name
 * @param bool $sorted Whether to sort the attributes by name
 * @return string
 */
static function fixTagAttributes( $text, $element, $sorted = false ) {
	if ( trim( $text ) == '' ) {
		return '';
	}

	$attribs = self::validateTagAttributes(
		self::decodeTagAttributes( $text ),
		$element
	);
	if ( $sorted ) {
		ksort( $attribs );
	}

	return self::safeEncodeTagAttributes( $attribs );
}
1112 
/**
 * Encode a string for use as an HTML attribute value.
 *
 * Besides the usual HTML special characters (both quote styles included),
 * newline, carriage return and tab are turned into numeric character
 * references: whitespace is normalized during attribute decoding, so these
 * characters would otherwise not be preserved.
 *
 * @param string $text
 * @return string HTML-encoded text
 */
static function encodeAttribute( $text ) {
	$escaped = htmlspecialchars( $text, ENT_QUOTES );

	return str_replace(
		[ "\n", "\r", "\t" ],
		[ '&#10;', '&#13;', '&#9;' ],
		$escaped
	);
}
1132 
/**
 * Replace the plain space used in French typography around certain
 * punctuation with another string (a non-breaking space by default).
 *
 * Two cases are handled: a space before one of ? : ; ! % » › when
 * something non-blank precedes the space and no word character follows the
 * punctuation, and a space directly after « or ‹.
 *
 * @param string $text Text to process
 * @param string $space Replacement for the space
 * @return string
 */
public static function armorFrenchSpaces( $text, $space = '&#160;' ) {
	// Replace $ with \$ and \ with \\ so the value is safe as a
	// preg_replace() replacement string
	$space = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );

	$patterns = [
		# Space before closing punctuation, last one Guillemet-left:
		# only if there is something before the space
		# and a non-word character after the punctuation.
		'/(?<=\S) (?=[?:;!%»›](?!\w))/u',
		# Space after an opening guillemet
		'/([«‹]) /u',
	];
	$replacements = [
		"$space",
		"\\1$space",
	];

	return preg_replace( $patterns, $replacements, $text );
}
1154 
/**
 * Encode an attribute value for HTML and additionally armor it against
 * being re-interpreted as wikitext by later parsing stages.
 *
 * @param string $text
 * @return string HTML-encoded text
 */
static function safeEncodeAttribute( $text ) {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$wikitextArmor = [
		'<' => '&lt;', // This should never happen,
		'>' => '&gt;', // we've received invalid input
		'"' => '&quot;', // which should have been escaped.
		'{' => '&#123;',
		'}' => '&#125;', // prevent unpaired language conversion syntax
		'[' => '&#91;',
		']' => '&#93;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	];
	$encoded = strtr( self::encodeAttribute( $text ), $wikitextArmor );

	# Armor against French spaces detection (T5158)
	$encoded = self::armorFrenchSpaces( $encoded, '&#32;' );

	# Break up anything that looks like a URL protocol by encoding its colon
	$protocolPattern = '/((?i)' . wfUrlProtocols() . ')/';
	$encodeColon = function ( $matches ) {
		return str_replace( ':', '&#58;', $matches[1] );
	};

	return preg_replace_callback( $protocolPattern, $encodeColon, $encoded );
}
1194 
/**
 * Escape a string into an HTML4-style id: spaces become underscores, the
 * result is URL-encoded, '%3A' is turned back into ':' and every remaining
 * '%' into '.'.
 *
 * Unless the 'noninitial' option is given, an 'x' is prepended when the
 * result does not start with an ASCII letter.
 *
 * @param string $id Id to escape
 * @param string|array $options Option name or list of option names;
 *   'noninitial' suppresses the leading-letter fix-up
 * @return string
 */
static function escapeId( $id, $options = [] ) {
	$flags = (array)$options;

	// HTML4-style escaping
	$encoded = strtr(
		urlencode( str_replace( ' ', '_', $id ) ),
		[
			'%3A' => ':',
			'%' => '.'
		]
	);

	if ( preg_match( '/^[a-zA-Z]/', $encoded ) || in_array( 'noninitial', $flags ) ) {
		return $encoded;
	}

	// Initial character must be a letter!
	return "x$encoded";
}
1241 
/**
 * Escape a string for use as the value of an id attribute, using the
 * escaping mode configured in $wgFragmentMode for the requested slot.
 *
 * @param string $id Id to escape
 * @param int $mode Slot of $wgFragmentMode to use: self::ID_PRIMARY or
 *   self::ID_FALLBACK
 * @return string|bool Escaped id, or false when the requested
 *   (non-primary) mode is not configured
 * @throws UnexpectedValueException When no primary mode is configured
 */
public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
	global $wgFragmentMode;

	if ( isset( $wgFragmentMode[$mode] ) ) {
		return self::escapeIdInternal( $id, $wgFragmentMode[$mode] );
	}

	// A missing primary mode is a configuration error; a missing
	// fallback simply means there is no such id.
	if ( $mode === self::ID_PRIMARY ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return false;
}
1271 
/**
 * Escape a string for use as a link fragment, using the primary escaping
 * mode configured in $wgFragmentMode.
 *
 * @param string $id Id to escape
 * @return string
 * @throws UnexpectedValueException When no primary mode is configured
 */
public static function escapeIdForLink( $id ) {
	global $wgFragmentMode;

	$hasPrimary = isset( $wgFragmentMode[self::ID_PRIMARY] );
	if ( !$hasPrimary ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return self::escapeIdInternal( $id, $wgFragmentMode[self::ID_PRIMARY] );
}
1297 
/**
 * Escape a string for use as a link fragment pointing at an external
 * interwiki target, using the mode configured in
 * $wgExternalInterwikiFragmentMode.
 *
 * @param string $id Id to escape
 * @return string
 * @throws InvalidArgumentException From escapeIdInternal() when the
 *   configured mode is not recognised
 */
public static function escapeIdForExternalInterwiki( $id ) {
	// Without this declaration the variable would be an undefined local and
	// escapeIdInternal() would always reject the (null) mode.
	global $wgExternalInterwikiFragmentMode;

	$id = self::escapeIdInternal( $id, $wgExternalInterwikiFragmentMode );

	return $id;
}
1314 
/**
 * Escape an id according to the given mode.
 *
 * - 'html5': only spaces are replaced, by underscores.
 * - 'legacy': spaces become underscores, the result is URL-encoded,
 *   '%3A' is turned back into ':' and every remaining '%' into '.'.
 *
 * @param string $id Id to escape
 * @param string $mode One of 'html5' or 'legacy'
 * @return string
 * @throws InvalidArgumentException For any other mode
 */
private static function escapeIdInternal( $id, $mode ) {
	if ( $mode == 'html5' ) {
		return str_replace( ' ', '_', $id );
	}

	if ( $mode == 'legacy' ) {
		// This corresponds to 'noninitial' mode of the old escapeId()
		$encoded = urlencode( str_replace( ' ', '_', $id ) );

		return strtr( $encoded, [
			'%3A' => ':',
			'%' => '.'
		] );
	}

	throw new InvalidArgumentException( "Invalid mode '$mode' passed to '" . __METHOD__ );
}
1343 
/**
 * Escape every id in a whitespace-separated list of id references (as
 * used by attributes such as aria-labelledby).
 *
 * @param string $referenceString Whitespace-separated list of ids
 * @return string The escaped ids joined by single spaces; an empty string
 *   when the list holds no ids
 */
public static function escapeIdReferenceList( $referenceString ) {
	# Break the list into its individual tokens, ignoring blank ones
	$tokens = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );

	# Escape each token as an id
	$escaped = [];
	foreach ( $tokens as $token ) {
		$escaped[] = self::escapeIdForAttribute( $token );
	}

	# Join the tokens back into a space delimited list
	return implode( ' ', $escaped );
}
1368 
/**
 * Escape a value so that it can be used as a CSS class name.
 *
 * @param string $class Raw class name
 * @return string Escaped class name
 */
static function escapeClass( $class ) {
	// Applied in order: first turn a leading digit/hyphen, ASCII control
	// characters, most ASCII punctuation and the C2 A0 byte pair into
	// underscores, then collapse each run of underscores into one.
	$patterns = [
		'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
		'/_+/',
	];
	$underscored = preg_replace( $patterns, '_', $class );

	// Trailing underscores are dropped; a leading one is kept.
	return rtrim( $underscored, '_' );
}
1387 
/**
 * Escape HTML input with htmlspecialchars() after first decoding any
 * character references it contains, so existing entities are not
 * double-escaped.
 *
 * @param string $html HTML to escape
 * @return string Escaped text
 */
static function escapeHtmlAllowEntities( $html ) {
	$decoded = self::decodeCharReferences( $html );

	// ENT_QUOTES: escape ' as well as ", as a matter of course.
	// ENT_SUBSTITUTE: incorrectly truncated multibyte characters must not
	// cause the entire string to disappear.
	return htmlspecialchars( $decoded, ENT_QUOTES | ENT_SUBSTITUTE );
}
1403 
/**
 * Return an associative array of attribute names and values from a partial
 * tag string. Names are lowercased; values are whitespace-normalised and
 * have their character references decoded.
 *
 * @param string $text Attribute portion of a tag
 * @return array Map of attribute name => decoded value
 */
public static function decodeTagAttributes( $text ) {
	$decoded = [];

	if ( trim( $text ) == '' ) {
		return $decoded;
	}

	$matchSets = [];
	$found = preg_match_all( self::getAttribsRegex(), $text, $matchSets, PREG_SET_ORDER );
	if ( !$found ) {
		return $decoded;
	}

	foreach ( $matchSets as $matchSet ) {
		$name = strtolower( $matchSet[1] );

		// Only keep attributes whose names pass the attribute-name regex.
		if ( preg_match( self::getAttribNameRegex(), $name ) ) {
			$raw = self::getTagAttributeCallback( $matchSet );

			// Collapse whitespace runs to one space and strip the ends.
			$collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

			// A later duplicate of the same name overwrites the earlier one.
			$decoded[$name] = self::decodeCharReferences( $collapsed );
		}
	}

	return $decoded;
}
1446 
/**
 * Build a partial tag string from an associative array of attribute names
 * and values, such as the one returned by decodeTagAttributes().
 *
 * @param array $assoc_array Map of attribute name => value
 * @return string ' name="value" ...' with a leading space, or '' when the
 *   array is empty
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
	$html = '';

	foreach ( $assoc_array as $name => $rawValue ) {
		$encName = htmlspecialchars( $name );
		$encValue = self::safeEncodeAttribute( $rawValue );

		// Each pair carries its own leading space, which yields the same
		// string as joining the pairs with spaces and prefixing one.
		$html .= " $encName=\"$encValue\"";
	}

	return $html;
}
1464 
/**
 * Pick the appropriate attribute value from a match set produced by the
 * attribs regex.
 *
 * @param array $set One match set (PREG_SET_ORDER element)
 * @return string The attribute value, or "" for a valueless attribute
 * @throws MWException When group 2 is set but none of the value groups are
 */
private static function getTagAttributeCallback( $set ) {
	// Value groups in priority order: 5 = unquoted, 4 = single-quoted,
	// 3 = double-quoted.
	foreach ( [ 5, 4, 3 ] as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( isset( $set[2] ) ) {
		throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
	}

	# In XHTML, attributes must have a value so return an empty string.
	# See "Empty attribute syntax",
	# https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
	return "";
}
1492 
/**
 * Collapse every run of spaces, tabs, CRs and LFs into a single space and
 * trim the result.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
	$collapsed = preg_replace( '/(?:\r\n|[\x20\x0d\x0a\x09])+/', ' ', $text );

	return trim( $collapsed );
}
1503 
/**
 * Normalize whitespace in a section name: every run of spaces and/or
 * underscores becomes one space, and the result is trimmed.
 *
 * @param string $section Section name
 * @return string Normalized section name
 */
static function normalizeSectionNameWhitespace( $section ) {
	$spaced = preg_replace( '/[ _]+/', ' ', $section );

	return trim( $spaced );
}
1515 
/**
 * Ensure that any entities and character references in the text are legal
 * for XML and XHTML specifically. Each match of self::CHAR_REFS_REGEX is
 * rewritten by normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
static function normalizeCharReferences( $text ) {
	$rewrite = static function ( $matches ) {
		return self::normalizeCharReferencesCallback( $matches );
	};

	return preg_replace_callback( self::CHAR_REFS_REGEX, $rewrite, $text );
}
1537 
/**
 * Callback for normalizeCharReferences(): rewrite one match of
 * self::CHAR_REFS_REGEX.
 *
 * @param array $matches Regex match: [1] named entity, [2] decimal
 *   reference, [3] hexadecimal reference
 * @return string Normalized reference, or the HTML-escaped original text
 *   when the match is a bare ampersand or an invalid numeric reference
 */
static function normalizeCharReferencesCallback( $matches ) {
	$ret = null;
	if ( $matches[1] != '' ) {
		$ret = self::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		$ret = self::decCharReference( $matches[2] );
	} elseif ( $matches[3] != '' ) {
		$ret = self::hexCharReference( $matches[3] );
	}

	// null means either no reference group matched (lone '&') or the
	// numeric helper rejected the codepoint: emit the input escaped.
	if ( $ret === null ) {
		return htmlspecialchars( $matches[0] );
	}

	return $ret;
}
1557 
/**
 * Normalize a named entity. Aliases listed in self::HTML_ENTITY_ALIASES are
 * translated to their target name; lt, gt, amp and quot are kept as named
 * entities; other names known to self::HTML_ENTITIES become decimal numeric
 * references; anything else has its ampersand escaped.
 *
 * @param string $name Entity name, without '&' and ';'
 * @return string Normalized entity or escaped text
 */
static function normalizeEntity( $name ) {
	$alias = self::HTML_ENTITY_ALIASES[$name] ?? null;
	if ( $alias !== null ) {
		return '&' . $alias . ';';
	}

	if ( in_array( $name, [ 'lt', 'gt', 'amp', 'quot' ] ) ) {
		return "&$name;";
	}

	$codepoint = self::HTML_ENTITIES[$name] ?? null;
	if ( $codepoint !== null ) {
		return '&#' . $codepoint . ';';
	}

	// Unknown name: neutralise the ampersand so it renders literally.
	return "&amp;$name;";
}
1579 
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint Decimal digits of the reference
 * @return string|null '&#N;' when validateCodepoint() accepts the value,
 *   null otherwise
 */
static function decCharReference( $codepoint ) {
	$point = intval( $codepoint );

	return self::validateCodepoint( $point )
		? sprintf( '&#%d;', $point )
		: null;
}
1592 
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint Hexadecimal digits of the reference
 * @return string|null '&#xN;' (lowercase hex) when validateCodepoint()
 *   accepts the value, null otherwise
 */
static function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );

	return self::validateCodepoint( $point )
		? sprintf( '&#x%x;', $point )
		: null;
}
1605 
/**
 * Returns true if a given Unicode codepoint is a valid character in both
 * HTML5 and XML.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	# U+007F - U+009F are disallowed in HTML5 (control characters).
	if ( $codepoint == 0x09 || $codepoint == 0x0a ) {
		return true;
	}

	// Inclusive [low, high] ranges of accepted codepoints.
	$acceptedRanges = [
		[ 0x20, 0x7e ],
		[ 0xa0, 0xd7ff ],
		[ 0xe000, 0xfffd ],
		[ 0x10000, 0x10ffff ],
	];
	foreach ( $acceptedRanges as $range ) {
		if ( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
			return true;
		}
	}

	return false;
}
1623 
/**
 * Decode any character references, numeric or named entities, in the text
 * and return a UTF-8 string. Each match of self::CHAR_REFS_REGEX is
 * replaced by decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
	$decodeOne = static function ( $matches ) {
		return self::decodeCharReferencesCallback( $matches );
	};

	return preg_replace_callback( self::CHAR_REFS_REGEX, $decodeOne, $text );
}
1637 
/**
 * Decode any character references, numeric or named entities, in the text,
 * and pass the result through the content language's normalize() if at
 * least one replacement was made.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
	$replacements = 0;
	$decoded = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		[ self::class, 'decodeCharReferencesCallback' ],
		$text,
		-1, // no limit
		$replacements
	);

	// Nothing was replaced, so the normalize() call is skipped.
	if ( !$replacements ) {
		return $decoded;
	}

	return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $decoded );
}
1663 
/**
 * Callback for decodeCharReferences() and
 * decodeCharReferencesAndNormalize(): decode one match of
 * self::CHAR_REFS_REGEX.
 *
 * @param array $matches Regex match: [1] named entity, [2] decimal
 *   reference, [3] hexadecimal reference
 * @return string Decoded text, or the matched text unchanged when no
 *   reference group matched
 */
static function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		return self::decodeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		return self::decodeChar( intval( $matches[2] ) );
	} elseif ( $matches[3] != '' ) {
		return self::decodeChar( hexdec( $matches[3] ) );
	}

	# Last case should be an ampersand by itself
	return $matches[0];
}
1679 
/**
 * Return the UTF-8 string for a codepoint if that is a valid character
 * reference, otherwise the UTF-8 replacement constant.
 *
 * @param int $codepoint
 * @return string
 */
static function decodeChar( $codepoint ) {
	if ( !self::validateCodepoint( $codepoint ) ) {
		return UtfNormal\Constants::UTF8_REPLACEMENT;
	}

	return UtfNormal\Utils::codepointToUtf8( $codepoint );
}
1694 
/**
 * Decode a named entity to UTF-8. Aliases in self::HTML_ENTITY_ALIASES are
 * resolved first. A name not present in self::HTML_ENTITIES is returned in
 * entity form ("&name;"), using the alias target if one was applied.
 *
 * @param string $name Entity name, without '&' and ';'
 * @return string
 */
static function decodeEntity( $name ) {
	$canonical = self::HTML_ENTITY_ALIASES[$name] ?? $name;

	if ( !isset( self::HTML_ENTITIES[$canonical] ) ) {
		return "&$canonical;";
	}

	return UtfNormal\Utils::codepointToUtf8( self::HTML_ENTITIES[$canonical] );
}
1713 
/**
 * Fetch the whitelist of acceptable attributes for a given element name.
 *
 * @deprecated since 1.34 (emits a deprecation notice via wfDeprecated())
 * @param string $element Element name
 * @return array Sequential list of attribute names; empty for an unknown
 *   element
 */
static function attributeWhitelist( $element ) {
	wfDeprecated( __METHOD__, '1.34' );

	$perElement = self::setupAttributeWhitelist();

	return isset( $perElement[$element] ) ? $perElement[$element] : [];
}
1726 
/**
 * Fetch the whitelist of acceptable attributes for a given element name.
 *
 * @param string $element Element name
 * @return array Map whose keys are the allowed attribute names; empty for
 *   an unknown element
 */
private static function attributeWhitelistInternal( $element ) {
	return self::setupAttributeWhitelistInternal()[$element] ?? [];
}
1738 
/**
 * For each array key (an allowed HTML element), return an array of allowed
 * attributes.
 *
 * @deprecated since 1.34 (emits a deprecation notice via wfDeprecated())
 * @return array Map of element name => sequential list of attribute names
 */
static function setupAttributeWhitelist() {
	wfDeprecated( __METHOD__, '1.34' );

	// The internal table stores attribute names as array keys; this method
	// is expected to return a sequential array as the value for each HTML
	// element key, so each entry is reduced to its keys.
	return array_map( 'array_keys', self::setupAttributeWhitelistInternal() );
}
1754 
/**
 * For each array key (an allowed HTML element), return an array of allowed
 * attributes. The table is built on first use and kept in a static
 * variable for later calls.
 *
 * @return array Map of element name => map whose keys are the allowed
 *   attribute names
 */
private static function setupAttributeWhitelistInternal() {
	static $table = null;

	if ( $table !== null ) {
		return $table;
	}

	// For lookup efficiency each list of attribute names is flipped so the
	// names become the keys, then merged on top of the base map.
	$extend = function ( array $base, array ...$nameLists ) {
		return array_merge( $base, ...array_map( 'array_flip', $nameLists ) );
	};

	// Attributes accepted on (nearly) every element.
	$universal = $extend( [], [
		# HTML
		'id',
		'class',
		'style',
		'lang',
		'dir',
		'title',

		# WAI-ARIA
		'aria-describedby',
		'aria-flowto',
		'aria-label',
		'aria-labelledby',
		'aria-owns',
		'role',

		# RDFa
		# These attributes are specified in section 9 of
		# https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		'about',
		'property',
		'resource',
		'datatype',
		'typeof',

		# Microdata. These are specified by
		# https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
		'itemid',
		'itemprop',
		'itemref',
		'itemscope',
		'itemtype',
	] );

	// Universal attributes plus 'align'.
	$blockLevel = $extend( $universal, [ 'align' ] );

	$cellAlignment = [ 'align', 'valign' ];
	$cellAttributes = [
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor', # deprecated
	];

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: https://www.w3.org/TR/html4/
	$table = [
		# 7.5.4
		'div' => $blockLevel,
		'center' => $universal, # deprecated
		'span' => $universal,

		# 7.5.5
		'h1' => $blockLevel,
		'h2' => $blockLevel,
		'h3' => $blockLevel,
		'h4' => $blockLevel,
		'h5' => $blockLevel,
		'h6' => $blockLevel,

		# 7.5.6
		# address

		# 8.2.4
		'bdo' => $universal,

		# 9.2.1
		'em' => $universal,
		'strong' => $universal,
		'cite' => $universal,
		'dfn' => $universal,
		'code' => $universal,
		'samp' => $universal,
		'kbd' => $universal,
		'var' => $universal,
		'abbr' => $universal,
		# acronym

		# 9.2.2
		'blockquote' => $extend( $universal, [ 'cite' ] ),
		'q' => $extend( $universal, [ 'cite' ] ),

		# 9.2.3
		'sub' => $universal,
		'sup' => $universal,

		# 9.3.1
		'p' => $blockLevel,

		# 9.3.2
		'br' => $extend( $universal, [ 'clear' ] ),

		# https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
		'wbr' => $universal,

		# 9.3.4
		'pre' => $extend( $universal, [ 'width' ] ),

		# 9.4
		'ins' => $extend( $universal, [ 'cite', 'datetime' ] ),
		'del' => $extend( $universal, [ 'cite', 'datetime' ] ),

		# 10.2
		'ul' => $extend( $universal, [ 'type' ] ),
		'ol' => $extend( $universal, [ 'type', 'start', 'reversed' ] ),
		'li' => $extend( $universal, [ 'type', 'value' ] ),

		# 10.3
		'dl' => $universal,
		'dd' => $universal,
		'dt' => $universal,

		# 11.2.1
		'table' => $extend( $universal,
			[ 'summary', 'width', 'border', 'frame',
				'rules', 'cellspacing', 'cellpadding',
				'align', 'bgcolor',
			] ),

		# 11.2.2
		'caption' => $blockLevel,

		# 11.2.3
		'thead' => $universal,
		'tfoot' => $universal,
		'tbody' => $universal,

		# 11.2.4
		'colgroup' => $extend( $universal, [ 'span' ] ),
		'col' => $extend( $universal, [ 'span' ] ),

		# 11.2.5
		'tr' => $extend( $universal, [ 'bgcolor' ], $cellAlignment ),

		# 11.2.6
		'td' => $extend( $universal, $cellAttributes, $cellAlignment ),
		'th' => $extend( $universal, $cellAttributes, $cellAlignment ),

		# 12.2
		# NOTE: <a> is not allowed directly, but the attrib
		# whitelist is used from the Parser object
		'a' => $extend( $universal, [ 'href', 'rel', 'rev' ] ), # rel/rev esp. for RDFa

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized, or if $wgAllowImageTag is
		# true
		'img' => $extend( $universal, [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
		# Attributes for A/V tags added in T163583 / T133673
		'audio' => $extend( $universal, [ 'controls', 'preload', 'width', 'height' ] ),
		'video' => $extend( $universal, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
		'source' => $extend( $universal, [ 'type', 'src' ] ),
		'track' => $extend( $universal, [ 'type', 'src', 'srclang', 'kind', 'label' ] ),

		# 15.2.1
		'tt' => $universal,
		'b' => $universal,
		'i' => $universal,
		'big' => $universal,
		'small' => $universal,
		'strike' => $universal,
		's' => $universal,
		'u' => $universal,

		# 15.2.2
		'font' => $extend( $universal, [ 'size', 'color', 'face' ] ),
		# basefont

		# 15.3
		'hr' => $extend( $universal, [ 'width' ] ),

		# HTML Ruby annotation text module, simple ruby only.
		# https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
		'ruby' => $universal,
		# rbc
		'rb' => $universal,
		'rp' => $universal,
		'rt' => $universal, # $extend( $universal, [ 'rbspan' ] ),
		'rtc' => $universal,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# https://www.w3.org/TR/REC-MathML/
		'math' => $extend( [], [ 'class', 'style', 'id', 'title' ] ),

		// HTML 5 section 4.5
		'figure' => $universal,
		'figure-inline' => $universal, # T118520
		'figcaption' => $universal,

		# HTML 5 section 4.6
		'bdi' => $universal,

		# HTML5 elements, defined by:
		# https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
		'data' => $extend( $universal, [ 'value' ] ),
		'time' => $extend( $universal, [ 'datetime' ] ),
		'mark' => $universal,

		// meta and link are only permitted by removeHTMLtags when Microdata
		// is enabled so we don't bother adding a conditional to hide these
		// Also meta and link are only valid in WikiText as Microdata elements
		// (ie: validateTag rejects tags missing the attributes needed for Microdata)
		// So we don't bother including the universal attributes that have no purpose.
		'meta' => $extend( [], [ 'itemprop', 'content' ] ),
		'link' => $extend( [], [ 'itemprop', 'href', 'title' ] ),
	];

	return $table;
}
1991 
/**
 * Take a fragment of (potentially invalid) HTML and return a version with
 * any tags removed, encoded as plain text, with whitespace normalized.
 *
 * @param string $html HTML fragment
 * @return string
 */
static function stripAllTags( $html ) {
	// Use RemexHtml to tokenize $html and extract the text
	$textCollector = new RemexStripTagHandler;
	$tokenizerOptions = [
		'ignoreErrors' => true,
		// don't ignore char refs, we want them to be decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	];
	$tokenizer = new RemexHtml\Tokenizer\Tokenizer( $textCollector, $html, $tokenizerOptions );
	$tokenizer->execute();

	return self::normalizeWhitespace( $textCollector->getResult() );
}
2018 
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations: one
 * <!ENTITY> declaration per entry of self::HTML_ENTITIES.
 *
 * @return string
 */
static function hackDocType() {
	$declarations = '';
	foreach ( self::HTML_ENTITIES as $entity => $codepoint ) {
		$declarations .= "<!ENTITY $entity \"&#$codepoint;\">";
	}

	return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
2036 
/**
 * Clean up a URL: decode character references, percent-encode a fixed set
 * of unsafe characters, strip characters that are ignored in IDNs from the
 * host part, and restore brackets around an IPv6 literal host.
 *
 * @param string $url
 * @return string
 */
static function cleanUrl( $url ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = self::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step
	$url = preg_replace_callback(
		'/[\][<>"\\x00-\\x20\\x7F\|]/',
		[ __CLASS__, 'cleanUrlCallback' ],
		$url
	);

	# Validate hostname portion
	$parts = [];
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $parts ) ) {
		// Not of the form "protocol:..." - nothing more to do.
		return $url;
	}
	$protocol = $parts[1];
	$host = $parts[2];
	$rest = $parts[3];

	// Characters that will be ignored in IDNs.
	// https://tools.ietf.org/html/rfc3454#section-3.1
	// Strip them before further processing so blacklists and such work.
	$ignoredInIdn = "/
		\\s|          # general whitespace
		\xc2\xad|     # 00ad SOFT HYPHEN
		\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
		\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
		\xe2\x81\xa0| # 2060 WORD JOINER
		\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
		\xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
		\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
		\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
		\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
		\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
		\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
		[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
		/xuD";
	$host = preg_replace( $ignoredInIdn, '', $host );

	// IPv6 host names are bracketed with []. Url-decode these.
	$ipv6 = [];
	$looksBracketed = substr_compare( "//%5B", $host, 0, 5 ) === 0;
	if ( $looksBracketed &&
		preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $ipv6 )
	) {
		$host = '//[' . $ipv6[1] . ']' . $ipv6[2];
	}

	// @todo FIXME: Validate hostnames here

	return $protocol . $host . $rest;
}
2090 
/**
 * Callback used by cleanUrl(): percent-encode the matched text.
 *
 * @param array $matches Regex match; element 0 is the matched text
 * @return string
 */
static function cleanUrlCallback( $matches ) {
	$matched = $matches[0];

	return urlencode( $matched );
}
2098 
/**
 * Does a string look like an e-mail address?
 *
 * The 'isValidEmailAddr' hook runs first; when Hooks::run() returns a falsy
 * value, whatever the hook left in $result is returned as-is. Otherwise the
 * address is matched against a pattern of the form
 * "local-part@label(.label)*".
 *
 * @param string $addr E-mail address to check
 * @return bool|mixed Result of the built-in check, or the hook-provided
 *   result when a hook aborted the run
 */
public static function validateEmail( $addr ) {
	$result = null;
	if ( !Hooks::run( 'isValidEmailAddr', [ $addr, &$result ] ) ) {
		return $result;
	}

	// Please note strings below are enclosed in brackets [], this make the
	// hyphen "-" a range indicator. Hence it is double backslashed below.
	// See T28948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	// The D modifier makes "$" match only at the very end of the subject.
	// Without it "$" also matches just before a trailing newline, so
	// "user@example.org\n" would be accepted.
	$html5_email_regexp = "/
		^                          # start of string
		[$rfc5322_atext\\.]+       # user part which is liberal :p
		@                          # at sign
		[$rfc1034_ldh_str]+        # First domain part
		(\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
		$                          # End of string
		/ixD"; // case Insensitive, eXtended, Dollar end only

	return (bool)preg_match( $html5_email_regexp, $addr );
}
2150 }
$wgExternalInterwikiFragmentMode
Which ID escaping mode should be used for external interwiki links? See documentation for $wgFragment...
static armorFrenchSpaces( $text, $space='&#160;')
Armor French spaces with a replacement character.
Definition: Sanitizer.php:1141
wfUrlProtocols( $includeProtocolRelative=true)
Returns a regular expression of url protocols.
static escapeId( $id, $options=[])
Given a value, escape it so that it can be used in an id attribute and return it. ...
Definition: Sanitizer.php:1223
static normalizeWhitespace( $text)
Definition: Sanitizer.php:1497
static attributeWhitelistInternal( $element)
Fetch the whitelist of acceptable attributes for a given element name.
Definition: Sanitizer.php:1734
const HTML_ENTITIES
List of all named character entities defined in HTML 4.01 https://www.w3.org/TR/html4/sgml/entities.html As well as &apos; which is only defined starting in XHTML1.
Definition: Sanitizer.php:81
const HTML_ENTITY_ALIASES
Character entity aliases accepted by MediaWiki.
Definition: Sanitizer.php:340
const ID_PRIMARY
Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding.
Definition: Sanitizer.php:66
static setupAttributeWhitelist()
Foreach array key (an allowed HTML element), return an array of allowed attributes.
Definition: Sanitizer.php:1745
static cleanUrl( $url)
Definition: Sanitizer.php:2041
static isEnabled()
Definition: MWTidy.php:54
static escapeIdReferenceList( $referenceString)
Given a string containing a space delimited list of ids, escape each id to match ids escaped by the e...
Definition: Sanitizer.php:1353
static isReservedDataAttribute( $attr)
Given an attribute name, checks whether it is a reserved data attribute (such as data-mw-foo) which i...
Definition: Sanitizer.php:915
static normalizeCss( $value)
Normalize CSS into a format we can easily search for hostile input.
Definition: Sanitizer.php:957
static $attribNameRegex
Lazy-initialised attribute name regex, see getAttribNameRegex()
Definition: Sanitizer.php:379
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
Definition: Sanitizer.php:2028
static hexCharReference( $codepoint)
Definition: Sanitizer.php:1597
static attributeWhitelist( $element)
Fetch the whitelist of acceptable attributes for a given element name.
Definition: Sanitizer.php:1721
static escapeIdInternal( $id, $mode)
Helper for escapeIdFor*() functions.
Definition: Sanitizer.php:1322
const ID_FALLBACK
Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false if no fallback...
Definition: Sanitizer.php:74
static validateEmail( $addr)
Does a string look like an e-mail address?
Definition: Sanitizer.php:2127
static decCharReference( $codepoint)
Definition: Sanitizer.php:1584
const EVIL_URI_PATTERN
Blacklist for evil uris like javascript: WARNING: DO NOT use this in any place that actually requires...
Definition: Sanitizer.php:58
static validateTagAttributes( $attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
Definition: Sanitizer.php:791
if( $line===false) $args
Definition: mcc.php:124
const ELEMENT_BITS_REGEX
Acceptable tag name charset from HTML5 parsing spec https://www.w3.org/TR/html5/syntax.html#tag-open-state.
Definition: Sanitizer.php:48
static stripAllTags( $html)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed, encoded as plain text.
Definition: Sanitizer.php:2003
static cssDecodeCallback( $matches)
Definition: Sanitizer.php:1056
static getRecognizedTagData( $extratags=[], $removetags=[])
Return the various lists of recognized tags.
Definition: Sanitizer.php:400
static escapeIdForAttribute( $id, $mode=self::ID_PRIMARY)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid HTM...
Definition: Sanitizer.php:1257
static mergeAttributes( $a, $b)
Merge two sets of HTML attributes.
Definition: Sanitizer.php:936
const CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferenc...
Definition: Sanitizer.php:38
static getTagAttributeCallback( $set)
Pick the appropriate attribute value from a match set from the attribs regex matches.
Definition: Sanitizer.php:1473
static decodeChar( $codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEM...
Definition: Sanitizer.php:1687
$wgFragmentMode
How should section IDs be encoded? This array can contain 1 or 2 elements, each of them can be one of...
static cleanUrlCallback( $matches)
Definition: Sanitizer.php:2095
static decodeTagAttributes( $text)
Return an associative array of attribute names and values from a partial tag string.
Definition: Sanitizer.php:1412
static escapeClass( $class)
Given a value, escape it so that it can be used as a CSS class and return it.
Definition: Sanitizer.php:1380
static normalizeCharReferences( $text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
Definition: Sanitizer.php:1531
static validateTag( $params, $element)
Takes attribute names and values for a tag and the tag name and validates that the tag is allowed to ...
Definition: Sanitizer.php:755
const XMLNS_ATTRIBUTE_PATTERN
Definition: Sanitizer.php:59
static safeEncodeTagAttributes( $assoc_array)
Build a partial tag string from an associative array of attribute names and values as returned by dec...
Definition: Sanitizer.php:1454
static validateCodepoint( $codepoint)
Returns true if a given Unicode codepoint is a valid character in both HTML5 and XML.
Definition: Sanitizer.php:1612
static normalizeCharReferencesCallback( $matches)
Definition: Sanitizer.php:1542
static decodeCharReferencesCallback( $matches)
Definition: Sanitizer.php:1668
$wgAllowImageTag
A different approach to the above: simply allow the "<img>" tag to be used.
static encodeAttribute( $text)
Encode an attribute value for HTML output.
Definition: Sanitizer.php:1118
static escapeIdForLink( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
Definition: Sanitizer.php:1284
static removeHTMLcomments( $text)
Remove '<!--', '-->', and everything between.
Definition: Sanitizer.php:709
wfDeprecated( $function, $version=false, $component=false, $callerOffset=2)
Throws a warning that $function is deprecated.
static decodeCharReferences( $text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string...
Definition: Sanitizer.php:1631
static escapeIdForExternalInterwiki( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
Definition: Sanitizer.php:1307
static normalizeEntity( $name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the equivalent numeric entity re...
Definition: Sanitizer.php:1568
static $attribsRegex
Lazy-initialised attributes regex, see getAttribsRegex()
Definition: Sanitizer.php:342
static checkCss( $value)
Pick apart some CSS and check it for forbidden or unsafe structures.
Definition: Sanitizer.php:1027
static fixTagAttributes( $text, $element, $sorted=false)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML...
Definition: Sanitizer.php:1098
static decodeEntity( $name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that chara...
Definition: Sanitizer.php:1703
static getAttribsRegex()
Regular expression to match HTML/XML attribute pairs within a tag.
Definition: Sanitizer.php:356
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Definition: Sanitizer.php:1395
static removeHTMLtags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[], $warnCallback=null)
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments. ...
Definition: Sanitizer.php:497
static validateAttributes( $attribs, $whitelist)
Take an array of attribute names and values and normalize or discard illegal values for the given whi...
Definition: Sanitizer.php:813
static getAttribNameRegex()
Used in Sanitizer::decodeTagAttributes to filter attributes.
Definition: Sanitizer.php:385
static normalizeSectionNameWhitespace( $section)
Normalizes whitespace in a section name, such as might be returned by Parser::stripSectionName(), for use in the ids that are used for section links.
Definition: Sanitizer.php:1512
static setupAttributeWhitelistInternal()
Foreach array key (an allowed HTML element), return an array of allowed attributes.
Definition: Sanitizer.php:1762
static run( $event, array $args=[], $deprecatedVersion=null)
Call hook functions defined in Hooks::register and $wgHooks.
Definition: Hooks.php:200
static decodeCharReferencesAndNormalize( $text)
Decode any character references, numeric or named entities, in the text and normalize the resulting s...
Definition: Sanitizer.php:1648
static delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags='')
Perform an operation equivalent to preg_replace() with flags.
static safeEncodeAttribute( $text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing...
Definition: Sanitizer.php:1161
$matches