MediaWiki  master
Sanitizer.php
Go to the documentation of this file.
1 <?php
28 
33 class Sanitizer {
/**
 * Regular expression (extended syntax) matching character references.
 * Capture groups: 1 = named entity, 2 = decimal number,
 * 3 = hexadecimal number, 4 = a bare ampersand that starts none of those.
 */
38  private const CHAR_REFS_REGEX =
39  '/&([A-Za-z0-9\x80-\xff]+);
40  |&\#([0-9]+);
41  |&\#[xX]([0-9A-Fa-f]+);
42  |(&)/x';
43 
/**
 * Splits the text that follows a '<' into its parts. Capture groups:
 * 1 = optional leading slash, 2 = tag name, 3 = attribute text,
 * 4 = closing brace ('>' or '/>'), 5 = trailing text up to the next '<'.
 */
48  private const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
49 
/**
 * Matches "javascript" or "vbscript" (case-insensitively) at the start of a
 * value, after whitespace, or after a closing comment marker, when followed
 * by a non-word character or the end of the string. validateAttributes()
 * drops attribute values that match.
 */
59  private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
/** Matches attribute names of the form "xmlns:<suffix>". */
60  private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
61 
/**
 * Index into $wgFragmentMode selecting the primary escaping mode.
 * This is the default $mode of escapeIdForAttribute().
 */
67  public const ID_PRIMARY = 0;
68 
/**
 * Index into $wgFragmentMode selecting the fallback escaping mode.
 * escapeIdForAttribute() returns false when no mode is configured at this index.
 */
75  public const ID_FALLBACK = 1;
76 
/**
 * Map of named character entity => Unicode code point (decimal).
 * normalizeEntity() uses it to turn "&name;" into "&#N;".
 * NOTE(review): this looks like the HTML 4.01 entity set plus 'apos' —
 * confirm against the upstream documentation.
 */
82  private const HTML_ENTITIES = [
83  'Aacute' => 193,
84  'aacute' => 225,
85  'Acirc' => 194,
86  'acirc' => 226,
87  'acute' => 180,
88  'AElig' => 198,
89  'aelig' => 230,
90  'Agrave' => 192,
91  'agrave' => 224,
92  'alefsym' => 8501,
93  'Alpha' => 913,
94  'alpha' => 945,
95  'amp' => 38,
96  'and' => 8743,
97  'ang' => 8736,
98  'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
99  'Aring' => 197,
100  'aring' => 229,
101  'asymp' => 8776,
102  'Atilde' => 195,
103  'atilde' => 227,
104  'Auml' => 196,
105  'auml' => 228,
106  'bdquo' => 8222,
107  'Beta' => 914,
108  'beta' => 946,
109  'brvbar' => 166,
110  'bull' => 8226,
111  'cap' => 8745,
112  'Ccedil' => 199,
113  'ccedil' => 231,
114  'cedil' => 184,
115  'cent' => 162,
116  'Chi' => 935,
117  'chi' => 967,
118  'circ' => 710,
119  'clubs' => 9827,
120  'cong' => 8773,
121  'copy' => 169,
122  'crarr' => 8629,
123  'cup' => 8746,
124  'curren' => 164,
125  'dagger' => 8224,
126  'Dagger' => 8225,
127  'darr' => 8595,
128  'dArr' => 8659,
129  'deg' => 176,
130  'Delta' => 916,
131  'delta' => 948,
132  'diams' => 9830,
133  'divide' => 247,
134  'Eacute' => 201,
135  'eacute' => 233,
136  'Ecirc' => 202,
137  'ecirc' => 234,
138  'Egrave' => 200,
139  'egrave' => 232,
140  'empty' => 8709,
141  'emsp' => 8195,
142  'ensp' => 8194,
143  'Epsilon' => 917,
144  'epsilon' => 949,
145  'equiv' => 8801,
146  'Eta' => 919,
147  'eta' => 951,
148  'ETH' => 208,
149  'eth' => 240,
150  'Euml' => 203,
151  'euml' => 235,
152  'euro' => 8364,
153  'exist' => 8707,
154  'fnof' => 402,
155  'forall' => 8704,
156  'frac12' => 189,
157  'frac14' => 188,
158  'frac34' => 190,
159  'frasl' => 8260,
160  'Gamma' => 915,
161  'gamma' => 947,
162  'ge' => 8805,
163  'gt' => 62,
164  'harr' => 8596,
165  'hArr' => 8660,
166  'hearts' => 9829,
167  'hellip' => 8230,
168  'Iacute' => 205,
169  'iacute' => 237,
170  'Icirc' => 206,
171  'icirc' => 238,
172  'iexcl' => 161,
173  'Igrave' => 204,
174  'igrave' => 236,
175  'image' => 8465,
176  'infin' => 8734,
177  'int' => 8747,
178  'Iota' => 921,
179  'iota' => 953,
180  'iquest' => 191,
181  'isin' => 8712,
182  'Iuml' => 207,
183  'iuml' => 239,
184  'Kappa' => 922,
185  'kappa' => 954,
186  'Lambda' => 923,
187  'lambda' => 955,
188  'lang' => 9001,
189  'laquo' => 171,
190  'larr' => 8592,
191  'lArr' => 8656,
192  'lceil' => 8968,
193  'ldquo' => 8220,
194  'le' => 8804,
195  'lfloor' => 8970,
196  'lowast' => 8727,
197  'loz' => 9674,
198  'lrm' => 8206,
199  'lsaquo' => 8249,
200  'lsquo' => 8216,
201  'lt' => 60,
202  'macr' => 175,
203  'mdash' => 8212,
204  'micro' => 181,
205  'middot' => 183,
206  'minus' => 8722,
207  'Mu' => 924,
208  'mu' => 956,
209  'nabla' => 8711,
210  'nbsp' => 160,
211  'ndash' => 8211,
212  'ne' => 8800,
213  'ni' => 8715,
214  'not' => 172,
215  'notin' => 8713,
216  'nsub' => 8836,
217  'Ntilde' => 209,
218  'ntilde' => 241,
219  'Nu' => 925,
220  'nu' => 957,
221  'Oacute' => 211,
222  'oacute' => 243,
223  'Ocirc' => 212,
224  'ocirc' => 244,
225  'OElig' => 338,
226  'oelig' => 339,
227  'Ograve' => 210,
228  'ograve' => 242,
229  'oline' => 8254,
230  'Omega' => 937,
231  'omega' => 969,
232  'Omicron' => 927,
233  'omicron' => 959,
234  'oplus' => 8853,
235  'or' => 8744,
236  'ordf' => 170,
237  'ordm' => 186,
238  'Oslash' => 216,
239  'oslash' => 248,
240  'Otilde' => 213,
241  'otilde' => 245,
242  'otimes' => 8855,
243  'Ouml' => 214,
244  'ouml' => 246,
245  'para' => 182,
246  'part' => 8706,
247  'permil' => 8240,
248  'perp' => 8869,
249  'Phi' => 934,
250  'phi' => 966,
251  'Pi' => 928,
252  'pi' => 960,
253  'piv' => 982,
254  'plusmn' => 177,
255  'pound' => 163,
256  'prime' => 8242,
257  'Prime' => 8243,
258  'prod' => 8719,
259  'prop' => 8733,
260  'Psi' => 936,
261  'psi' => 968,
262  'quot' => 34,
263  'radic' => 8730,
264  'rang' => 9002,
265  'raquo' => 187,
266  'rarr' => 8594,
267  'rArr' => 8658,
268  'rceil' => 8969,
269  'rdquo' => 8221,
270  'real' => 8476,
271  'reg' => 174,
272  'rfloor' => 8971,
273  'Rho' => 929,
274  'rho' => 961,
275  'rlm' => 8207,
276  'rsaquo' => 8250,
277  'rsquo' => 8217,
278  'sbquo' => 8218,
279  'Scaron' => 352,
280  'scaron' => 353,
281  'sdot' => 8901,
282  'sect' => 167,
283  'shy' => 173,
284  'Sigma' => 931,
285  'sigma' => 963,
286  'sigmaf' => 962,
287  'sim' => 8764,
288  'spades' => 9824,
289  'sub' => 8834,
290  'sube' => 8838,
291  'sum' => 8721,
292  'sup' => 8835,
293  'sup1' => 185,
294  'sup2' => 178,
295  'sup3' => 179,
296  'supe' => 8839,
297  'szlig' => 223,
298  'Tau' => 932,
299  'tau' => 964,
300  'there4' => 8756,
301  'Theta' => 920,
302  'theta' => 952,
303  'thetasym' => 977,
304  'thinsp' => 8201,
305  'THORN' => 222,
306  'thorn' => 254,
307  'tilde' => 732,
308  'times' => 215,
309  'trade' => 8482,
310  'Uacute' => 218,
311  'uacute' => 250,
312  'uarr' => 8593,
313  'uArr' => 8657,
314  'Ucirc' => 219,
315  'ucirc' => 251,
316  'Ugrave' => 217,
317  'ugrave' => 249,
318  'uml' => 168,
319  'upsih' => 978,
320  'Upsilon' => 933,
321  'upsilon' => 965,
322  'Uuml' => 220,
323  'uuml' => 252,
324  'weierp' => 8472,
325  'Xi' => 926,
326  'xi' => 958,
327  'Yacute' => 221,
328  'yacute' => 253,
329  'yen' => 165,
330  'Yuml' => 376,
331  'yuml' => 255,
332  'Zeta' => 918,
333  'zeta' => 950,
334  'zwj' => 8205,
335  'zwnj' => 8204
336  ];
337 
/**
 * Alternative spellings of entity names, mapped to the canonical name.
 * normalizeEntity() rewrites "&alias;" to "&canonical;".
 */
341  private const HTML_ENTITY_ALIASES = [
342  'רלמ' => 'rlm',
343  'رلم' => 'rlm',
344  ];
345 
/** Cache for the pattern built by getAttribsRegex(); null until first use. */
349  private static $attribsRegex;
350 
/**
 * Build (once) and return the regular expression used to split the
 * attribute text of a tag into name/value pairs.
 *
 * Capture groups: 1 = attribute name, 2 = the "=value" part when present,
 * 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value.
 *
 * @return string
 */
private static function getAttribsRegex() {
	if ( self::$attribsRegex !== null ) {
		// Already built on an earlier call.
		return self::$attribsRegex;
	}

	$ws = '\x09\x0a\x0c\x0d\x20';
	$wsClass = "[{$ws}]";
	$nameChar = "[^{$ws}\/>=]";
	// A name may start with '=', but may not contain one afterwards.
	$nameStart = "(?:{$nameChar}|=)";

	self::$attribsRegex =
		"/({$nameStart}{$nameChar}*)
			($wsClass*=$wsClass*
			(?:
			# The attribute value: quoted or alone
			\"([^\"]*)(?:\"|\$)
			| '([^']*)(?:'|\$)
			| (((?!$wsClass|>).)*)
			)
			)?/sxu";

	return self::$attribsRegex;
}
376 
/** Cache for the pattern built by getAttribNameRegex(); null until first use. */
380  private static $attribNameRegex;
381 
/**
 * Build (once) and return the regular expression that an attribute name
 * must match in full to be accepted by decodeTagAttributes().
 *
 * @return string
 */
private static function getAttribNameRegex() {
	if ( self::$attribNameRegex !== null ) {
		return self::$attribNameRegex;
	}

	// First character: colon, underscore, letter or number.
	$nameStart = "[:_\p{L}\p{N}]";
	// Later characters may also be '.' or '-'.
	$nameChar = "[:_\.\-\p{L}\p{N}]";
	self::$attribNameRegex = "/^({$nameStart}{$nameChar}*)$/sxu";

	return self::$attribNameRegex;
}
394 
/**
 * Return the lists of recognised HTML tags, each as a hashtable keyed by
 * tag name for fast isset() lookup.
 *
 * The static tables are built once and cached. The cache is keyed on the
 * boolean value of $wgAllowImageTag, so it is rebuilt only when that
 * setting actually changes (for example between tests).
 *
 * @param string[] $extratags Extra tags to treat as allowed paired tags
 * @param string[] $removetags Tags (default or extra) to exclude
 * @return array[] Keys: htmlpairs, htmlsingle, htmlsingleonly, htmlnest,
 *  tabletags, htmllist, listtags, htmlsingleallowed, htmlelements
 */
public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
	global $wgAllowImageTag;

	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

	// $staticInitialised is null before the first build and afterwards holds
	// the boolean it was built for. A strict comparison is needed: a truthiness
	// test would treat "built for false" as "never built" and rebuild on every call.
	$globalContext = (bool)$wgAllowImageTag;
	if ( $staticInitialised !== $globalContext ) {
		$htmlpairsStatic = [ # Tags that must be closed
			'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
			'kbd', 'samp', 'data', 'time', 'mark'
		];
		$htmlsingle = [
			'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
		];

		# Elements that cannot have close tags. This is (not coincidentally)
		# also the list of tags for which the HTML 5 parsing algorithm
		# requires you to "acknowledge the token's self-closing flag", i.e.
		# a self-closing tag like <br/> is not an HTML 5 parse error only
		# for this list.
		$htmlsingleonly = [
			'br', 'wbr', 'hr', 'meta', 'link'
		];

		$htmlnest = [ # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
			'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
		];
		$tabletags = [ # Can only appear inside table, we will close them
			'td', 'th', 'tr',
		];
		$htmllist = [ # Tags used by list
			'ul', 'ol',
		];
		$listtags = [ # Tags that can appear in a list
			'li',
		];

		if ( $globalContext ) {
			wfDeprecatedMsg( 'Setting $wgAllowImageTag to true ' .
				'is deprecated since MediaWiki 1.35', '1.35', false, false );
			$htmlsingle[] = 'img';
			$htmlsingleonly[] = 'img';
		}

		// Derived lists must be computed before the flips below.
		$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
		$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

		# Convert them all to hashtables for faster lookup
		$htmlpairsStatic = array_flip( $htmlpairsStatic );
		$htmlsingle = array_flip( $htmlsingle );
		$htmlsingleonly = array_flip( $htmlsingleonly );
		$htmlnest = array_flip( $htmlnest );
		$tabletags = array_flip( $tabletags );
		$htmllist = array_flip( $htmllist );
		$listtags = array_flip( $listtags );
		$htmlsingleallowed = array_flip( $htmlsingleallowed );
		$htmlelementsStatic = array_flip( $htmlelementsStatic );

		$staticInitialised = $globalContext;
	}

	# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
	$extratags = array_flip( $extratags );
	$removetags = array_flip( $removetags );
	$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
	$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );

	return [
		'htmlpairs' => $htmlpairs,
		'htmlsingle' => $htmlsingle,
		'htmlsingleonly' => $htmlsingleonly,
		'htmlnest' => $htmlnest,
		'tabletags' => $tabletags,
		'htmllist' => $htmllist,
		'listtags' => $listtags,
		'htmlsingleallowed' => $htmlsingleallowed,
		'htmlelements' => $htmlelements,
	];
}
484 
/**
 * Clean up HTML by escaping every tag that is not on the allowed list and
 * sanitising the attributes of the tags that are. HTML comments are removed.
 *
 * @param string $text Original string; see T20054
 * @param callable|null $processCallback Called with the attribute text of
 *  each allowed tag (by reference) and $args, so it can rewrite attributes
 * @param array|bool $args Arguments passed through to $processCallback
 * @param string[] $extratags Extra tags to allow
 * @param string[] $removetags Tags to disallow
 * @return string
 */
public static function removeHTMLtags( $text, $processCallback = null,
	$args = [], $extratags = [], $removetags = []
) {
	$tagData = self::getRecognizedTagData( $extratags, $removetags );
	$htmlsingle = $tagData['htmlsingle'];
	$htmlsingleonly = $tagData['htmlsingleonly'];
	$htmlelements = $tagData['htmlelements'];

	# Remove HTML comments
	$text = self::removeHTMLcomments( $text );
	$bits = explode( '<', $text );
	// Everything before the first '<' is plain text.
	$text = str_replace( '>', '&gt;', array_shift( $bits ) );

	# this might be possible using remex tidy itself
	foreach ( $bits as $x ) {
		if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
			[ , $slash, $t, $params, $brace, $rest ] = $regs;

			$t = strtolower( $t );
			if ( isset( $htmlelements[$t] ) ) {
				if ( is_callable( $processCallback ) ) {
					call_user_func_array( $processCallback, [ &$params, $args ] );
				}

				if ( $brace == '/>' && !( isset( $htmlsingle[$t] ) || isset( $htmlsingleonly[$t] ) ) ) {
					// Remove the self-closing slash, to be consistent
					// with HTML5 semantics. T134423
					$brace = '>';
				}

				// Attributes are only sanitised for tags that will be emitted.
				if ( self::validateTag( $params, $t ) ) {
					$newparams = self::fixTagAttributes( $params, $t );

					if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) {
						# Interpret self-closing tags as empty tags even when
						# HTML 5 would interpret them as start tags. Such input
						# is commonly seen on Wikimedia wikis with this intention.
						$brace = "></$t>";
					}

					$rest = str_replace( '>', '&gt;', $rest );
					$text .= "<$slash$t$newparams$brace$rest";
					continue;
				}
			}
		}
		// Not a recognised, valid tag: emit it as escaped text.
		$text .= '&lt;' . str_replace( '>', '&gt;', $x );
	}
	return $text;
}
555 
/**
 * Remove '<!--' ... '-->' comments from the text.
 *
 * When a comment sits alone on a line (only spaces around it, a newline
 * before and after), the comment, the surrounding spaces and one of the
 * newlines are removed together. An unterminated comment stops processing
 * and is left in place.
 *
 * @param string $text
 * @return string
 */
public static function removeHTMLcomments( $text ) {
	for ( ; ; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		// Offset just past the closing '-->'.
		$close += 3;

		# Widen the span over any spaces before and after the comment
		$from = max( $open - 1, 0 );
		$span = $close - $from;
		while ( $from > 0 && substr( $text, $from, 1 ) === ' ' ) {
			$from--;
			$span++;
		}
		while ( substr( $text, $from + $span, 1 ) === ' ' ) {
			$span++;
		}

		$newlineBefore = substr( $text, $from, 1 ) === "\n";
		$newlineAfter = substr( $text, $from + $span, 1 ) === "\n";
		if ( $newlineBefore && $newlineAfter ) {
			# Drop the comment with its surrounding spaces,
			# collapsing the two newlines into one.
			$text = substr_replace( $text, "\n", $from, $span + 1 );
		} else {
			# Drop only the comment itself.
			$text = substr_replace( $text, '', $open, $close - $open );
		}
	}
	return $text;
}
598 
/**
 * Decide from its attributes whether a tag may be emitted at all.
 *
 * Only <meta> and <link> are restricted: both need an itemprop attribute,
 * <meta> also needs content and <link> also needs href.
 *
 * @param string $params Raw attribute text of the tag
 * @param string $element Lower-case tag name
 * @return bool
 */
private static function validateTag( $params, $element ) {
	$attribs = self::decodeTagAttributes( $params );

	$isMeta = $element == 'meta';
	$isLink = $element == 'link';
	if ( !$isMeta && !$isLink ) {
		return true;
	}

	// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
	if ( !isset( $attribs['itemprop'] ) ) {
		return false;
	}
	// <meta> must have a content="" for the itemprop
	if ( $isMeta && !isset( $attribs['content'] ) ) {
		return false;
	}
	// <link> must have an associated href=""
	if ( $isLink && !isset( $attribs['href'] ) ) {
		return false;
	}

	return true;
}
631 
/**
 * Filter an attribute array down to what is allowed on the given element.
 * The allowed set comes from attributesAllowedInternal(); the filtering
 * itself is done by validateAttributes().
 *
 * @param array $attribs Attribute name => value
 * @param string $element Tag name
 * @return array Filtered and sanitised attributes
 */
public static function validateTagAttributes( $attribs, $element ) {
	$allowedForElement = self::attributesAllowedInternal( $element );

	return self::validateAttributes( $attribs, $allowedForElement );
}
651 
/**
 * Filter an attribute array against a set of allowed attribute names and
 * sanitise the values of the attributes that are kept.
 *
 * - xmlns:* declarations are kept unless the value looks like a script URI.
 * - data-* attributes without a colon are kept unless reserved
 *   (see isReservedDataAttribute()).
 * - style values go through checkCss(); id values and ARIA id lists are escaped.
 * - RDFa/microdata values that look like script URIs are dropped.
 * - href/src/poster must start with an allowed protocol.
 * - tabindex is kept only when it is exactly "0".
 * - itemtype/itemid/itemref are dropped when itemscope is absent.
 *
 * @param array $attribs Attribute name => value
 * @param array $allowed Allowed attribute names as array keys. A sequential
 *  list of names is still accepted but deprecated since 1.35.
 * @return array Filtered and sanitised attributes
 */
public static function validateAttributes( $attribs, $allowed ) {
	if ( isset( $allowed[0] ) ) {
		// Calling this function with a sequential array is
		// deprecated. For now just convert it.
		wfDeprecated( __METHOD__ . ' with sequential array', '1.35' );
		$allowed = array_flip( $allowed );
	}
	$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	// Lookup tables keyed by attribute name. isset() on these is a strict
	// key lookup, unlike the loose == comparison on the attribute name.
	$idListAttributes = [
		'aria-describedby' => true,
		'aria-flowto' => true,
		'aria-labelledby' => true,
		'aria-owns' => true,
	];
	$uriLikeAttributes = [
		'rel' => true,
		'rev' => true,
		# RDFa
		'about' => true,
		'property' => true,
		'resource' => true,
		'datatype' => true,
		'typeof' => true,
		# HTML5 microdata
		'itemid' => true,
		'itemprop' => true,
		'itemref' => true,
		'itemscope' => true,
		'itemtype' => true,
	];
	$urlAttributes = [
		'href' => true,
		'src' => true,
		'poster' => true,
	];

	$out = [];
	foreach ( $attribs as $attribute => $value ) {
		# Allow XML namespace declaration to allow RDFa
		if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				$out[$attribute] = $value;
			}

			continue;
		}

		# Allow any attribute beginning with "data-"
		# However:
		# * Disallow data attributes used by MediaWiki code
		# * Ensure that the attribute is not namespaced by banning
		#   colons.
		$isPlainDataAttribute = (bool)preg_match( '/^data-[^:]*$/i', $attribute );
		if ( ( !$isPlainDataAttribute && !array_key_exists( $attribute, $allowed ) )
			|| self::isReservedDataAttribute( $attribute )
		) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# https://msdn.microsoft.com/en-us/library/ms537634.aspx
		if ( $attribute === 'style' ) {
			$value = self::checkCss( $value );
		}

		# Escape HTML id attributes
		if ( $attribute === 'id' ) {
			$value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
		}

		# Escape HTML id reference lists
		if ( isset( $idListAttributes[$attribute] ) ) {
			$value = self::escapeIdReferenceList( $value );
		}

		// RDFa and microdata properties allow URLs, URIs and/or CURIs.
		// Check them for sanity.
		// Paranoia. Allow "simple" values but suppress javascript
		if ( isset( $uriLikeAttributes[$attribute] )
			&& preg_match( self::EVIL_URI_PATTERN, $value )
		) {
			continue;
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		#       validation code that can be used by tag hook handlers, etc
		if ( isset( $urlAttributes[$attribute] ) && !preg_match( $hrefExp, $value ) ) {
			// drop any href or src attributes not using an allowed protocol.
			// NOTE: this also drops all relative URLs
			continue;
		}

		if ( $attribute === 'tabindex' && $value !== '0' ) {
			// Only allow tabindex of 0, which is useful for accessibility.
			continue;
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$out[$attribute] = $value;
	}

	# itemtype, itemid, itemref don't make sense without itemscope
	if ( !array_key_exists( 'itemscope', $out ) ) {
		unset( $out['itemtype'], $out['itemid'], $out['itemref'] );
	}
	# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

	return $out;
}
769 
/**
 * Whether a data attribute name is reserved for MediaWiki itself.
 *
 * The match is a case-insensitive prefix test on "data-ooui", "data-mw"
 * and "data-parsoid".
 *
 * @param string $attr Attribute name
 * @return bool
 */
public static function isReservedDataAttribute( $attr ) {
	// data-ooui is reserved for ooui.
	// data-mw and data-parsoid are reserved for parsoid.
	// data-mw-<name here> is reserved for extensions (or core) if
	// they need to communicate some data to the client and want to be
	// sure that it isn't coming from an untrusted user.
	// We ignore the possibility of namespaces since user-generated HTML
	// can't use them anymore.
	$hits = preg_match( '/^data-(ooui|mw|parsoid)/i', $attr );

	return $hits === 1;
}
787 
/**
 * Merge two attribute arrays. Values from $b win, except for 'class':
 * when both sides carry a different string class, the class tokens of
 * both are combined, with duplicates removed.
 *
 * @param array $a
 * @param array $b
 * @return array
 */
public static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	$classA = $a['class'] ?? null;
	$classB = $b['class'] ?? null;
	if ( !is_string( $classA ) || !is_string( $classB ) || $classA === $classB ) {
		// Nothing to combine; the plain merge result stands.
		return $merged;
	}

	$tokens = preg_split( '/\s+/', "{$classA} {$classB}", -1, PREG_SPLIT_NO_EMPTY );
	$merged['class'] = implode( ' ', array_unique( $tokens ) );

	return $merged;
}
810 
/**
 * Normalise CSS text so that later checks see what a browser would see:
 * character references and CSS escape sequences are decoded and comments
 * are replaced by spaces.
 *
 * A value that is nothing but a single comment is returned as is, so that
 * error messages wrapped in a comment can pass through.
 *
 * @param string $value
 * @return string
 */
public static function normalizeCss( $value ) {
	// Decode character references like &#123;
	$value = self::decodeCharReferences( $value );

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references.
	// This means it isn't possible for this function to return
	// unsanitized escape sequences. It is possible to manufacture
	// input that contains character references that decode to
	// escape sequences that decode to character references, but
	// it's OK for the return value to contain character references
	// because the caller is supposed to escape those anyway.
	static $escapeRegex = null;
	if ( $escapeRegex === null ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$escapeRegex = "/ $backslash
			(?:
				($nl) | # 1. Line continuation
				([0-9A-Fa-f]{1,6})$space? | # 2. character number
				(.) | # 3. backslash cancelling special meaning
				() | # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback(
		$escapeRegex,
		[ self::class, 'cssDecodeCallback' ],
		$value
	);

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if ( preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
		return $value;
	}

	// Remove any comments; IE gets token splitting wrong
	// This must be done AFTER decoding character references and
	// escape sequences, because those steps can introduce comments
	// This step cannot introduce character references or escape
	// sequences, because it replaces comments with spaces rather
	// than removing them completely.
	$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

	// Remove anything after a comment-start token, to guard against
	// incorrect client implementations.
	$danglingComment = strpos( $value, '/*' );
	if ( $danglingComment !== false ) {
		$value = substr( $value, 0, $danglingComment );
	}

	return $value;
}
870 
/**
 * Normalise a CSS value and reject it if it contains anything considered
 * unsafe.
 *
 * Returns the normalised CSS when it passes. Otherwise returns one of the
 * comment strings "/* invalid control char *" + "/" or
 * "/* insecure input *" + "/" in place of the value.
 *
 * @param string $value
 * @return string
 */
public static function checkCss( $value ) {
	$value = self::normalizeCss( $value );

	// Reject problematic keywords and control characters
	if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ||
		strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false
	) {
		return '/* invalid control char */';
	}

	// Extended-syntax pattern: whitespace inside it is insignificant, and
	// every "\(" is a literal opening parenthesis.
	$insecurePattern = '! expression
			| filter\s*:
			| accelerator\s*:
			| -o-link\s*:
			| -o-link-source\s*:
			| -o-replace\s*:
			| url\s*\(
			| image\s*\(
			| image-set\s*\(
			| attr\s*\([^)]+[\s,]+url
			| var\s*\(
		!ix';
	if ( preg_match( $insecurePattern, $value ) ) {
		return '/* insecure input */';
	}

	return $value;
}
913 
/**
 * preg_replace_callback() handler for normalizeCss(): decode one CSS
 * backslash escape.
 *
 * @param array $matches 1 = line continuation, 2 = hexadecimal character
 *  number, 3 = escaped literal character; all empty for a lone backslash
 * @return string
 */
private static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		// Line continuation: the backslash and the newline both vanish.
		return '';
	}

	if ( $matches[2] !== '' ) {
		$char = UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$char = $matches[3];
	} else {
		$char = '\\';
	}

	$mustStayEscaped = $char == "\n" || $char == '"' || $char == "'" || $char == '\\';
	if ( !$mustStayEscaped ) {
		// Decode unnecessary escape
		return $char;
	}

	// These characters need to be escaped in strings
	// Clean up the escape sequence to avoid parsing errors by clients
	return '\\' . dechex( ord( $char ) ) . ' ';
}
938 
/**
 * Turn the raw attribute text of a tag into a sanitised attribute string:
 * decode it, keep only what is allowed on the element, and re-encode it.
 *
 * @param string $text Raw attribute text
 * @param string $element Tag name
 * @param bool $sorted Whether to sort the attributes by name
 * @return string Empty, or the attributes with a leading space
 */
public static function fixTagAttributes( $text, $element, $sorted = false ) {
	if ( trim( $text ) == '' ) {
		return '';
	}

	$kept = self::validateTagAttributes(
		self::decodeTagAttributes( $text ),
		$element
	);
	if ( $sorted ) {
		ksort( $kept );
	}

	return self::safeEncodeTagAttributes( $kept );
}
974 
/**
 * Encode a value for use inside a quoted HTML attribute. Besides the HTML
 * special characters and both quote types, newline, carriage return and
 * tab are encoded as numeric character references.
 *
 * @param string $text
 * @return string
 */
public static function encodeAttribute( $text ) {
	// Whitespace is normalized during attribute decoding,
	// so if we've been passed non-spaces we must encode them
	// ahead of time or they won't be preserved.
	return str_replace(
		[ "\n", "\r", "\t" ],
		[ '&#10;', '&#13;', '&#9;' ],
		htmlspecialchars( $text, ENT_QUOTES )
	);
}
994 
/**
 * Replace the spaces that French typography puts next to certain
 * punctuation with a non-breaking space (or another given string).
 *
 * Affected are: a space before ? : ; ! % » or ›, when something precedes
 * the space and no word character follows the punctuation mark; and a
 * space after « or ‹.
 *
 * @param string $text
 * @param string $space What to put in place of the space
 * @return string
 */
public static function armorFrenchSpaces( $text, $space = '&#160;' ) {
	// $space is used as a preg replacement string, so it must be escaped.
	// Replace $ with \$ and \ with \\
	$space = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );
	$fixtags = [
		# French spaces, last one Guillemet-left
		# only if there is something before the space
		# and a non-word character after the punctuation.
		'/(?<=\S) (?=[?:;!%»›](?!\w))/u' => "$space",
		# French spaces, Guillemet-right
		'/([«‹]) /u' => "\\1$space",
	];
	return preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
}
1016 
/**
 * Encode an attribute value for HTML and additionally neutralise anything
 * a later parsing pass could still interpret: brackets, braces, pipes,
 * doubled apostrophes and underscores, the magic-link words ISBN, RFC and
 * PMID, French spacing, and the colon in URL protocols.
 *
 * @param string $text
 * @return string
 */
public static function safeEncodeAttribute( $text ) {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$wikitextEscapes = [
		'<' => '&lt;', // This should never happen,
		'>' => '&gt;', // we've received invalid input
		'"' => '&quot;', // which should have been escaped.
		'{' => '&#123;',
		'}' => '&#125;', // prevent unpaired language conversion syntax
		'[' => '&#91;',
		']' => '&#93;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	];
	$encoded = strtr( self::encodeAttribute( $text ), $wikitextEscapes );

	# Armor against French spaces detection (T5158)
	$encoded = self::armorFrenchSpaces( $encoded, '&#32;' );

	# Stupid hack
	$protocolPattern = '/((?i)' . wfUrlProtocols() . ')/';
	return preg_replace_callback(
		$protocolPattern,
		static function ( $matches ) {
			return str_replace( ':', '&#58;', $matches[1] );
		},
		$encoded
	);
}
1056 
/**
 * Escape a string the HTML4 way for use as an id: spaces become
 * underscores, the rest is URL-encoded with '%' turned into '.' and
 * '%3A' back into ':'. Unless the 'noninitial' option is given, an 'x' is
 * prepended when the result does not start with an ASCII letter.
 *
 * @deprecated since 1.30
 * @param string $id
 * @param string|array $options 'noninitial' to skip the initial-letter fix
 * @return string
 */
public static function escapeId( $id, $options = [] ) {
	wfDeprecated( __METHOD__, '1.30' );
	$flags = (array)$options;

	// HTML4-style escaping
	$escaped = strtr(
		urlencode( str_replace( ' ', '_', $id ) ),
		[
			'%3A' => ':',
			'%' => '.'
		]
	);

	$startsWithLetter = preg_match( '/^[a-zA-Z]/', $escaped );
	if ( !$startsWithLetter && !in_array( 'noninitial', $flags ) ) {
		// Initial character must be a letter!
		return "x$escaped";
	}

	return $escaped;
}
1104 
/**
 * Escape a string for use as the value of an id attribute, using the
 * mode that $wgFragmentMode configures at index $mode.
 *
 * @param string $id
 * @param int $mode self::ID_PRIMARY or self::ID_FALLBACK
 * @return string|bool The escaped id, or false when no fallback mode is
 *  configured
 * @throws UnexpectedValueException When no primary mode is configured
 */
public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
	global $wgFragmentMode;

	if ( isset( $wgFragmentMode[$mode] ) ) {
		return self::escapeIdInternal( $id, $wgFragmentMode[$mode] );
	}

	if ( $mode === self::ID_PRIMARY ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return false;
}
1134 
/**
 * Escape a string for use as a link fragment, using the primary mode
 * from $wgFragmentMode.
 *
 * @param string $id
 * @return string
 * @throws UnexpectedValueException When no primary mode is configured
 */
public static function escapeIdForLink( $id ) {
	global $wgFragmentMode;

	if ( !isset( $wgFragmentMode[self::ID_PRIMARY] ) ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	// The configured primary mode decides how the fragment is escaped.
	$mode = $wgFragmentMode[self::ID_PRIMARY];

	return self::escapeIdInternalUrl( $id, $mode );
}
1160 
/**
 * Escape a string for use as a link fragment pointing to an external
 * interwiki, using the mode from $wgExternalInterwikiFragmentMode.
 *
 * NOTE(review): the listing this was taken from had lost the two
 * statements below, which left the method returning $id unescaped.
 * They are restored from upstream MediaWiki; confirm against the
 * repository.
 *
 * @param string $id
 * @return string
 */
public static function escapeIdForExternalInterwiki( $id ) {
	global $wgExternalInterwikiFragmentMode;

	$id = self::escapeIdInternalUrl( $id, $wgExternalInterwikiFragmentMode );

	return $id;
}
1177 
/**
 * Escape an id for use in a URL fragment. In 'html5' mode every existing
 * percent-escape (% followed by two hex digits) additionally has its '%'
 * encoded as '%25'.
 *
 * @param string $id
 * @param string $mode 'html5' or 'legacy'
 * @return string
 */
private static function escapeIdInternalUrl( $id, $mode ) {
	$escaped = self::escapeIdInternal( $id, $mode );
	if ( $mode !== 'html5' ) {
		return $escaped;
	}

	return preg_replace( '/%([a-fA-F0-9]{2})/', '%25$1', $escaped );
}
1194 
/**
 * Escape an id according to the given mode.
 *
 * - 'html5': tab, LF, FF, CR and space are replaced by underscores.
 * - 'legacy': spaces become underscores, the rest is URL-encoded with
 *   '%' turned into '.' and '%3A' back into ':'.
 *
 * @param string $id
 * @param string $mode 'html5' or 'legacy'
 * @return string
 * @throws InvalidArgumentException For any other mode
 */
private static function escapeIdInternal( $id, $mode ) {
	switch ( $mode ) {
		case 'html5':
			// html5 spec says ids must not have any of the following:
			// U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE
			// In practice, in wikitext, only tab, LF, CR (and SPACE) are
			// possible using either Lua or html entities.
			return str_replace( [ "\t", "\n", "\f", "\r", " " ], '_', $id );

		case 'legacy':
			// This corresponds to 'noninitial' mode of the old escapeId()
			static $replace = [
				'%3A' => ':',
				'%' => '.'
			];

			return strtr( urlencode( str_replace( ' ', '_', $id ) ), $replace );

		default:
			throw new InvalidArgumentException(
				"Invalid mode '$mode' passed to '" . __METHOD__ . "'"
			);
	}
}
1227 
/**
 * Escape every id in a whitespace-separated list of id references, as
 * used by attributes such as aria-labelledby.
 *
 * @param string $referenceString Whitespace-separated ids
 * @return string The escaped ids joined by single spaces; empty when the
 *  input contains no ids
 */
public static function escapeIdReferenceList( $referenceString ) {
	# Explode the space delimited list string into an array of tokens
	$tokens = preg_split( '/\s+/', (string)$referenceString, -1, PREG_SPLIT_NO_EMPTY );

	# Escape each token as an id
	$escaped = [];
	foreach ( $tokens as $token ) {
		$escaped[] = self::escapeIdForAttribute( $token );
	}

	# Merge the array back to a space delimited list string
	# If the array is empty, the result will be an empty string ('')
	return implode( ' ', $escaped );
}
1252 
/**
 * Turn a string into something usable as a CSS class name: a leading
 * digit or hyphen, control characters, spaces, most ASCII punctuation and
 * the bytes of a non-breaking space become underscores; runs of
 * underscores collapse to one; trailing underscores are removed.
 *
 * @param string $class
 * @return string
 */
public static function escapeClass( $class ) {
	// Convert ugly stuff to underscores ...
	$cleaned = preg_replace(
		'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
		'_',
		$class
	);
	// ... and kill underscores in ugly places
	$collapsed = preg_replace( '/_+/', '_', $cleaned );

	return rtrim( $collapsed, '_' );
}
1271 
/**
 * Escape text for HTML output while letting existing character references
 * through: they are decoded first, so they are not double-escaped.
 *
 * @param string $html
 * @return string
 */
public static function escapeHtmlAllowEntities( $html ) {
	$decoded = self::decodeCharReferences( $html );

	# It seems wise to escape ' as well as ", as a matter of course. Can't
	# hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
	# don't cause the entire string to disappear.
	return htmlspecialchars( $decoded, ENT_QUOTES | ENT_SUBSTITUTE );
}
1287 
/**
 * Parse the raw attribute text of a tag into an array of
 * lower-cased attribute name => decoded value.
 *
 * Attributes whose name contains unacceptable characters are skipped.
 * Whitespace runs in values are collapsed to one space and trimmed, and
 * character references are decoded. A later duplicate overrides an
 * earlier one.
 *
 * @param string $text
 * @return array
 */
public static function decodeTagAttributes( $text ) {
	if ( trim( $text ) == '' ) {
		return [];
	}

	$matches = [];
	$found = preg_match_all( self::getAttribsRegex(), $text, $matches, PREG_SET_ORDER );
	if ( !$found ) {
		return [];
	}

	$decoded = [];
	foreach ( $matches as $match ) {
		$name = strtolower( $match[1] );

		// Filter attribute names with unacceptable characters
		if ( preg_match( self::getAttribNameRegex(), $name ) ) {
			$raw = self::getTagAttributeCallback( $match );

			// Normalize whitespace
			$collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

			// Decode character references
			$decoded[$name] = self::decodeCharReferences( $collapsed );
		}
	}

	return $decoded;
}
1330 
/**
 * Render an attribute array as an HTML attribute string, with values
 * encoded by safeEncodeAttribute().
 *
 * @param array $assoc_array Attribute name => value
 * @return string Empty for no attributes; otherwise each attribute as
 *  name="value", preceded by a space
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
	$rendered = '';
	foreach ( $assoc_array as $attribute => $value ) {
		$encAttribute = htmlspecialchars( $attribute );
		$encValue = self::safeEncodeAttribute( $value );

		$rendered .= " $encAttribute=\"$encValue\"";
	}

	return $rendered;
}
1348 
/**
 * Pick the attribute value out of one match set produced by the
 * attribute regex.
 *
 * @param array $set Match set: 5 = unquoted, 4 = single-quoted,
 *  3 = double-quoted value; 2 = the whole "=value" part
 * @return string The value; an empty string for an attribute given
 *  without a value
 * @throws MWException When a value part was matched but no value group
 */
private static function getTagAttributeCallback( $set ) {
	// Unquoted, then single-quoted, then double-quoted.
	$value = $set[5] ?? $set[4] ?? $set[3] ?? null;
	if ( $value !== null ) {
		return $value;
	}

	if ( isset( $set[2] ) ) {
		throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
	}

	# In XHTML, attributes must have a value so return an empty string.
	# See "Empty attribute syntax",
	# https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
	return "";
}
1376 
/**
 * Collapse every run of spaces, tabs, carriage returns and newlines into
 * a single space and trim the result.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
	$collapsed = preg_replace( '/(?:\r\n|[\x20\x0d\x0a\x09])+/', ' ', $text );

	return trim( $collapsed );
}
1387 
/**
 * Normalise a section name: every run of spaces and underscores becomes
 * a single space, and the result is trimmed.
 *
 * @param string $section
 * @return string
 */
public static function normalizeSectionNameWhitespace( $section ) {
	$spaced = preg_replace( '/[ _]+/', ' ', $section );

	return trim( $spaced );
}
1399 
/**
 * Rewrite every character reference (and every bare ampersand) in the
 * text into a normalised form, as decided by
 * normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function normalizeCharReferences( $text ) {
	$normalizer = [ self::class, 'normalizeCharReferencesCallback' ];

	return preg_replace_callback( self::CHAR_REFS_REGEX, $normalizer, $text );
}
1421 
/**
 * preg_replace_callback() handler for normalizeCharReferences().
 *
 * @param array $matches 1 = entity name, 2 = decimal number,
 *  3 = hexadecimal number
 * @return string The normalised reference, or the matched text
 *  HTML-escaped when it is not a valid reference
 */
private static function normalizeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		$normalized = self::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		$normalized = self::decCharReference( $matches[2] );
	} elseif ( $matches[3] != '' ) {
		$normalized = self::hexCharReference( $matches[3] );
	} else {
		// A bare ampersand.
		$normalized = null;
	}

	return $normalized ?? htmlspecialchars( $matches[0] );
}
1441 
/**
 * If the named entity is defined in HTML_ENTITIES, return the equivalent
 * numeric character reference; otherwise return the name with its
 * ampersand escaped.
 *
 * Resolution order:
 *  - a name listed in HTML_ENTITY_ALIASES is rewritten to its alias target
 *    and kept as a named entity;
 *  - lt, gt, amp and quot are kept as named entities;
 *  - any other name in HTML_ENTITIES becomes a decimal reference;
 *  - everything else is returned as "&amp;name;".
 *
 * @param string $name Entity name, without the leading '&' or trailing ';'
 * @return string
 */
private static function normalizeEntity( $name ) {
	if ( isset( self::HTML_ENTITY_ALIASES[$name] ) ) {
		return '&' . self::HTML_ENTITY_ALIASES[$name] . ';';
	}

	// Strict comparison: only these exact strings may pass through as
	// named entities; no type juggling against the list.
	if ( in_array( $name, [ 'lt', 'gt', 'amp', 'quot' ], true ) ) {
		return "&$name;";
	}

	if ( isset( self::HTML_ENTITIES[$name] ) ) {
		return '&#' . self::HTML_ENTITIES[$name] . ';';
	}

	return "&amp;$name;";
}
1463 
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint Decimal digits of the reference
 * @return string|null "&#N;" when validateCodepoint() accepts the value,
 *  null otherwise
 */
private static function decCharReference( $codepoint ) {
	$point = intval( $codepoint );

	return self::validateCodepoint( $point )
		? sprintf( '&#%d;', $point )
		: null;
}
1476 
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint Hexadecimal digits of the reference
 * @return string|null "&#xN;" (lower-case hex) when validateCodepoint()
 *  accepts the value, null otherwise
 */
private static function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );

	return self::validateCodepoint( $point )
		? sprintf( '&#x%x;', $point )
		: null;
}
1489 
/**
 * Returns true if a given Unicode codepoint is a valid character in both
 * HTML5 and XML.
 *
 * @param int|float $codepoint Comparisons are deliberately loose so that
 *  the float hexdec() may return is handled like an integer
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	# U+007F - U+009F are disallowed in HTML5 (control characters).
	if ( $codepoint == 0x09 || $codepoint == 0x0a ) {
		return true;
	}

	$validRanges = [
		[ 0x20, 0x7e ],
		[ 0xa0, 0xd7ff ],
		[ 0xe000, 0xfffd ],
		[ 0x10000, 0x10ffff ],
	];
	foreach ( $validRanges as $range ) {
		if ( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
			return true;
		}
	}

	return false;
}
1507 
/**
 * Decode any character references, numeric or named entities, in the
 * text by passing every CHAR_REFS_REGEX match through
 * decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
	$decoder = [ self::class, 'decodeCharReferencesCallback' ];

	return preg_replace_callback( self::CHAR_REFS_REGEX, $decoder, $text );
}
1521 
/**
 * Decode any character references, numeric or named entities, in the
 * text and normalize the resulting string.
 *
 * The content language's normalize() is only invoked when at least one
 * replacement was actually made; otherwise the text is returned as-is.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
	$replacements = 0;
	$decoded = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		[ self::class, 'decodeCharReferencesCallback' ],
		$text,
		-1, // no limit
		$replacements
	);

	if ( !$replacements ) {
		return $decoded;
	}

	return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $decoded );
}
1547 
/**
 * preg_replace_callback() handler for the decodeCharReferences*()
 * methods.
 *
 * Group 1 is a named entity, group 2 a decimal reference and group 3 a
 * hexadecimal reference.
 *
 * @param array $matches
 * @return string
 */
private static function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		return self::decodeEntity( $matches[1] );
	}
	if ( $matches[2] != '' ) {
		return self::decodeChar( intval( $matches[2] ) );
	}
	if ( $matches[3] != '' ) {
		return self::decodeChar( hexdec( $matches[3] ) );
	}

	# Last case should be an ampersand by itself
	return $matches[0];
}
1563 
/**
 * Return the UTF-8 string for a codepoint if validateCodepoint() accepts
 * it, otherwise UtfNormal\Constants::UTF8_REPLACEMENT.
 *
 * @param int|float $codepoint
 * @return string
 */
private static function decodeChar( $codepoint ) {
	if ( !self::validateCodepoint( $codepoint ) ) {
		return UtfNormal\Constants::UTF8_REPLACEMENT;
	}

	return UtfNormal\Utils::codepointToUtf8( $codepoint );
}
1578 
/**
 * If the named entity is defined in HTML_ENTITIES, return the UTF-8
 * encoding of that character; otherwise return the entity text.
 *
 * Names listed in HTML_ENTITY_ALIASES are first replaced by their alias
 * target, so an unresolved alias is returned under the target's name.
 *
 * @param string $name Entity name, without the leading '&' or trailing ';'
 * @return string
 */
private static function decodeEntity( $name ) {
	$canonical = self::HTML_ENTITY_ALIASES[$name] ?? $name;

	if ( !isset( self::HTML_ENTITIES[$canonical] ) ) {
		return "&$canonical;";
	}

	return UtfNormal\Utils::codepointToUtf8( self::HTML_ENTITIES[$canonical] );
}
1597 
/**
 * Fetch the list of acceptable attributes for a given element name.
 *
 * @param string $element Element name, used as a key into the map built
 *  by setupAttributesAllowedInternal()
 * @return array Allowed attributes as array keys, or an empty array when
 *  the element is not in the map
 */
private static function attributesAllowedInternal( $element ) {
	// Without this lookup $list would be undefined and every element
	// would be reported as having no allowed attributes.
	$list = self::setupAttributesAllowedInternal();

	return $list[$element] ?? [];
}
1609 
/**
 * For each allowed HTML element, return an array of allowed attributes.
 *
 * The map is built on the first call and kept in a function-static
 * variable, so later calls return the same array without rebuilding it.
 * Each attribute list is flipped, so attribute names are the array keys.
 *
 * @return array[] Element name => array keyed by allowed attribute name
 */
private static function setupAttributesAllowedInternal() {
	static $allowedByElement;

	if ( $allowedByElement !== null ) {
		return $allowedByElement;
	}

	// For lookup efficiency flip each attributes array so the keys are
	// the valid attributes.
	$extend = static function ( $base, $extra, $more = [] ) {
		return array_merge( $base, array_flip( $extra ), array_flip( $more ) );
	};

	$common = $extend( [], [
		# HTML
		'id',
		'class',
		'style',
		'lang',
		'dir',
		'title',
		'tabindex',

		# WAI-ARIA
		'aria-describedby',
		'aria-flowto',
		'aria-hidden',
		'aria-label',
		'aria-labelledby',
		'aria-owns',
		'role',

		# RDFa
		# These attributes are specified in section 9 of
		# https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		'about',
		'property',
		'resource',
		'datatype',
		'typeof',

		# Microdata. These are specified by
		# https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
		'itemid',
		'itemprop',
		'itemref',
		'itemscope',
		'itemtype',
	] );

	$block = $extend( $common, [ 'align' ] );

	$tableAlign = [ 'align', 'valign' ];
	$tableCell = [
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor', # deprecated
	];

	// Attribute sets shared by more than one element.
	$withCite = $extend( $common, [ 'cite' ] );
	$withEditInfo = $extend( $common, [ 'cite', 'datetime' ] );
	$withSpan = $extend( $common, [ 'span' ] );
	$cell = $extend( $common, $tableCell, $tableAlign );

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: https://www.w3.org/TR/html4/
	$allowedByElement = [
		# 7.5.4
		'div' => $block,
		'center' => $common, # deprecated
		'span' => $common,

		# 7.5.5
		'h1' => $block,
		'h2' => $block,
		'h3' => $block,
		'h4' => $block,
		'h5' => $block,
		'h6' => $block,

		# 7.5.6
		# address

		# 8.2.4
		'bdo' => $common,

		# 9.2.1
		'em' => $common,
		'strong' => $common,
		'cite' => $common,
		'dfn' => $common,
		'code' => $common,
		'samp' => $common,
		'kbd' => $common,
		'var' => $common,
		'abbr' => $common,
		# acronym

		# 9.2.2
		'blockquote' => $withCite,
		'q' => $withCite,

		# 9.2.3
		'sub' => $common,
		'sup' => $common,

		# 9.3.1
		'p' => $block,

		# 9.3.2
		'br' => $extend( $common, [ 'clear' ] ),

		# https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
		'wbr' => $common,

		# 9.3.4
		'pre' => $extend( $common, [ 'width' ] ),

		# 9.4
		'ins' => $withEditInfo,
		'del' => $withEditInfo,

		# 10.2
		'ul' => $extend( $common, [ 'type' ] ),
		'ol' => $extend( $common, [ 'type', 'start', 'reversed' ] ),
		'li' => $extend( $common, [ 'type', 'value' ] ),

		# 10.3
		'dl' => $common,
		'dd' => $common,
		'dt' => $common,

		# 11.2.1
		'table' => $extend( $common, [
			'summary', 'width', 'border', 'frame',
			'rules', 'cellspacing', 'cellpadding',
			'align', 'bgcolor',
		] ),

		# 11.2.2
		'caption' => $block,

		# 11.2.3
		'thead' => $common,
		'tfoot' => $common,
		'tbody' => $common,

		# 11.2.4
		'colgroup' => $withSpan,
		'col' => $withSpan,

		# 11.2.5
		'tr' => $extend( $common, [ 'bgcolor' ], $tableAlign ),

		# 11.2.6
		'td' => $cell,
		'th' => $cell,

		# 12.2
		# NOTE: <a> is not allowed directly, but this list of allowed
		# attributes is used from the Parser object
		'a' => $extend( $common, [ 'href', 'rel', 'rev' ] ), # rel/rev esp. for RDFa

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized, or if $wgAllowImageTag is
		# true
		'img' => $extend( $common, [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
		# Attributes for A/V tags added in T163583 / T133673
		'audio' => $extend( $common, [ 'controls', 'preload', 'width', 'height' ] ),
		'video' => $extend( $common, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
		'source' => $extend( $common, [ 'type', 'src' ] ),
		'track' => $extend( $common, [ 'type', 'src', 'srclang', 'kind', 'label' ] ),

		# 15.2.1
		'tt' => $common,
		'b' => $common,
		'i' => $common,
		'big' => $common,
		'small' => $common,
		'strike' => $common,
		's' => $common,
		'u' => $common,

		# 15.2.2
		'font' => $extend( $common, [ 'size', 'color', 'face' ] ),
		# basefont

		# 15.3
		'hr' => $extend( $common, [ 'width' ] ),

		# HTML Ruby annotation text module, simple ruby only.
		# https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
		'ruby' => $common,
		# rbc
		'rb' => $common,
		'rp' => $common,
		'rt' => $common, # $extend( $common, [ 'rbspan' ] ),
		'rtc' => $common,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# https://www.w3.org/TR/REC-MathML/
		'math' => $extend( [], [ 'class', 'style', 'id', 'title' ] ),

		// HTML 5 section 4.5
		'figure' => $common,
		'figure-inline' => $common, # T118520
		'figcaption' => $common,

		# HTML 5 section 4.6
		'bdi' => $common,

		# HTML5 elements, defined by:
		# https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
		'data' => $extend( $common, [ 'value' ] ),
		'time' => $extend( $common, [ 'datetime' ] ),
		'mark' => $common,

		// meta and link are only permitted by removeHTMLtags when Microdata
		// is enabled so we don't bother adding a conditional to hide these
		// Also meta and link are only valid in WikiText as Microdata elements
		// (ie: validateTag rejects tags missing the attributes needed for Microdata)
		// So we don't bother including $common attributes that have no purpose.
		'meta' => $extend( [], [ 'itemprop', 'content' ] ),
		'link' => $extend( [], [ 'itemprop', 'href', 'title' ] ),
	];

	return $allowedByElement;
}
1848 
/**
 * Take a fragment of (potentially invalid) HTML and return a version
 * with any tags removed.
 *
 * The fragment is tokenized with RemexHtml, the text gathered by a
 * RemexStripTagHandler is taken as the result, and its whitespace is
 * normalized with normalizeWhitespace().
 *
 * @param string $html
 * @return string
 */
public static function stripAllTags( $html ) {
	$textCollector = new RemexStripTagHandler();
	$tokenizerOptions = [
		'ignoreErrors' => true,
		// don't ignore char refs, we want them to be decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	];

	// Use RemexHtml to tokenize $html and extract the text
	$tokenizer = new RemexHtml\Tokenizer\Tokenizer( $textCollector, $html, $tokenizerOptions );
	$tokenizer->execute();

	return self::normalizeWhitespace( $textCollector->getResult() );
}
1875 
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations.
 *
 * One <!ENTITY> declaration is emitted per HTML_ENTITIES entry, mapping
 * the entity name to its decimal character reference.
 *
 * @return string
 */
public static function hackDocType() {
	$declarations = [];
	foreach ( self::HTML_ENTITIES as $entity => $codepoint ) {
		$declarations[] = "<!ENTITY $entity \"&#$codepoint;\">";
	}

	return "<!DOCTYPE html [\n" . implode( '', $declarations ) . "]>\n";
}
1893 
/**
 * Clean up a URL: decode character references, percent-encode unsafe
 * characters, and tidy the host portion.
 *
 * When the URL does not match the "protocol:[//host]rest" shape it is
 * returned right after the escaping step. Otherwise characters that are
 * ignored in IDNs are stripped from the host, and a percent-encoded
 * bracketed IPv6 host is restored to its literal [..] form.
 *
 * @param string $url
 * @return string
 */
public static function cleanUrl( $url ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = self::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step
	$url = preg_replace_callback(
		'/[\][<>"\\x00-\\x20\\x7F\|]/',
		[ self::class, 'cleanUrlCallback' ],
		$url
	);

	# Validate hostname portion
	$parts = [];
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $parts ) ) {
		return $url;
	}
	[ /* $whole */, $protocol, $host, $rest ] = $parts;

	// Characters that will be ignored in IDNs.
	// https://tools.ietf.org/html/rfc3454#section-3.1
	// Strip them before further processing so deny lists and such work.
	$strip = "/
		\\s| # general whitespace
		\xc2\xad| # 00ad SOFT HYPHEN
		\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
		\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
		\xe2\x81\xa0| # 2060 WORD JOINER
		\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
		\xcd\x8f| # 034f COMBINING GRAPHEME JOINER
		\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
		\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
		\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
		\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
		\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
		[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
		/xuD";

	$host = preg_replace( $strip, '', $host );

	// IPv6 host names are bracketed with []. Url-decode these.
	$ipv6 = [];
	$looksBracketed = substr_compare( "//%5B", $host, 0, 5 ) === 0;
	if ( $looksBracketed &&
		preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $ipv6 )
	) {
		$host = '//[' . $ipv6[1] . ']' . $ipv6[2];
	}

	// @todo FIXME: Validate hostnames here

	return $protocol . $host . $rest;
}
1947 
/**
 * preg_replace_callback() handler for cleanUrl(): percent-encode the
 * matched text with urlencode().
 *
 * @param array $matches
 * @return string
 */
private static function cleanUrlCallback( $matches ) {
	$unsafe = $matches[0];

	return urlencode( $unsafe );
}
1955 
/**
 * Does a string look like an e-mail address?
 *
 * The IsValidEmailAddr hook runs first; when it returns false, whatever
 * it stored in $result is returned as-is. Otherwise the address is
 * checked against a liberal pattern: a user part, '@', and one or more
 * dot-separated domain labels made of letters, digits and hyphens.
 *
 * @param string $addr E-mail address
 * @return bool|null Result of the pattern check, or the hook-provided
 *  value when the hook aborts
 */
public static function validateEmail( $addr ) {
	$result = null;
	if ( !Hooks::runner()->onIsValidEmailAddr( $addr, $result ) ) {
		return $result;
	}

	// Please note strings below are enclosed in brackets [], this make the
	// hyphen "-" a range indicator. Hence it is double backslashed below.
	// See T28948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	// The D modifier makes "$" match only at the very end of the subject.
	// Without it "$" also matches just before a trailing newline, so
	// "user@example.org\n" would be accepted as valid.
	$html5_email_regexp = "/
		^ # start of string
		[$rfc5322_atext\\.]+ # user part which is liberal :p
		@ # at sign
		[$rfc1034_ldh_str]+ # First domain part
		(\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
		$ # End of string
		/ixD"; // case Insensitive, eXtended, Dollar matches at end only

	return (bool)preg_match( $html5_email_regexp, $addr );
}
2007 }
Sanitizer\ID_FALLBACK
const ID_FALLBACK
Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false if no fallback...
Definition: Sanitizer.php:75
Sanitizer\normalizeEntity
static normalizeEntity( $name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the equivalent numeric entity re...
Definition: Sanitizer.php:1452
Sanitizer\getTagAttributeCallback
static getTagAttributeCallback( $set)
Pick the appropriate attribute value from a match set from the attribs regex matches.
Definition: Sanitizer.php:1357
Sanitizer\removeHTMLcomments
static removeHTMLcomments( $text)
Remove '<!--', '-->', and everything between.
Definition: Sanitizer.php:565
Sanitizer\stripAllTags
static stripAllTags( $html)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed,...
Definition: Sanitizer.php:1860
Sanitizer\EVIL_URI_PATTERN
const EVIL_URI_PATTERN
Pattern matching evil uris like javascript: WARNING: DO NOT use this in any place that actually requi...
Definition: Sanitizer.php:59
MediaWiki\MediaWikiServices
MediaWikiServices is the service locator for the application scope of MediaWiki.
Definition: MediaWikiServices.php:152
$wgExternalInterwikiFragmentMode
$wgExternalInterwikiFragmentMode
Which ID escaping mode should be used for external interwiki links? See documentation for $wgFragment...
Definition: DefaultSettings.php:3584
Sanitizer\escapeIdForAttribute
static escapeIdForAttribute( $id, $mode=self::ID_PRIMARY)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid HTM...
Definition: Sanitizer.php:1120
Sanitizer\removeHTMLtags
static removeHTMLtags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments.
Definition: Sanitizer.php:496
Sanitizer\decodeEntity
static decodeEntity( $name)
If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return the UTF-8 encoding of that chara...
Definition: Sanitizer.php:1587
Sanitizer\mergeAttributes
static mergeAttributes( $a, $b)
Merge two sets of HTML attributes.
Definition: Sanitizer.php:798
Sanitizer\validateAttributes
static validateAttributes( $attribs, $allowed)
Take an array of attribute names and values and normalize or discard illegal values.
Definition: Sanitizer.php:670
Sanitizer\normalizeSectionNameWhitespace
static normalizeSectionNameWhitespace( $section)
Normalizes whitespace in a section name, such as might be returned by Parser::stripSectionName(),...
Definition: Sanitizer.php:1396
Sanitizer\escapeIdInternal
static escapeIdInternal( $id, $mode)
Helper for escapeIdFor*() functions.
Definition: Sanitizer.php:1202
Sanitizer\validateEmail
static validateEmail( $addr)
Does a string look like an e-mail address?
Definition: Sanitizer.php:1984
Sanitizer\attributesAllowedInternal
static attributesAllowedInternal( $element)
Fetch the list of acceptable attributes for a given element name.
Definition: Sanitizer.php:1605
Sanitizer\decCharReference
static decCharReference( $codepoint)
Definition: Sanitizer.php:1468
Sanitizer\safeEncodeTagAttributes
static safeEncodeTagAttributes( $assoc_array)
Build a partial tag string from an associative array of attribute names and values as returned by dec...
Definition: Sanitizer.php:1338
Sanitizer\decodeCharReferencesAndNormalize
static decodeCharReferencesAndNormalize( $text)
Decode any character references, numeric or named entities, in the text and normalize the resulting s...
Definition: Sanitizer.php:1532
Sanitizer\$attribsRegex
static $attribsRegex
Lazy-initialised attributes regex, see getAttribsRegex()
Definition: Sanitizer.php:349
$wgFragmentMode
$wgFragmentMode
How should section IDs be encoded? This array can contain 1 or 2 elements, each of them can be one of...
Definition: DefaultSettings.php:3574
Sanitizer\escapeClass
static escapeClass( $class)
Given a value, escape it so that it can be used as a CSS class and return it.
Definition: Sanitizer.php:1264
Sanitizer\normalizeCharReferencesCallback
static normalizeCharReferencesCallback( $matches)
Definition: Sanitizer.php:1426
Sanitizer\armorFrenchSpaces
static armorFrenchSpaces( $text, $space='&#160;')
Armor French spaces with a replacement character.
Definition: Sanitizer.php:1003
RemexStripTagHandler
Definition: RemexStripTagHandler.php:9
Sanitizer\validateTag
static validateTag( $params, $element)
Takes attribute names and values for a tag and the tag name and validates that the tag is allowed to ...
Definition: Sanitizer.php:611
Sanitizer\$attribNameRegex
static $attribNameRegex
Lazy-initialised attribute name regex, see getAttribNameRegex()
Definition: Sanitizer.php:380
Sanitizer\hackDocType
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
Definition: Sanitizer.php:1885
Sanitizer\XMLNS_ATTRIBUTE_PATTERN
const XMLNS_ATTRIBUTE_PATTERN
Definition: Sanitizer.php:60
$wgAllowImageTag
$wgAllowImageTag
A different approach to the above: simply allow the "<img>" tag to be used.
Definition: DefaultSettings.php:4630
wfDeprecatedMsg
wfDeprecatedMsg( $msg, $version=false, $component=false, $callerOffset=2)
Log a deprecation warning with arbitrary message text.
Definition: GlobalFunctions.php:1058
MWException
MediaWiki exception.
Definition: MWException.php:28
wfDeprecated
wfDeprecated( $function, $version=false, $component=false, $callerOffset=2)
Logs a warning that $function is deprecated.
Definition: GlobalFunctions.php:1026
Sanitizer\safeEncodeAttribute
static safeEncodeAttribute( $text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing.
Definition: Sanitizer.php:1023
$matches
$matches
Definition: NoLocalSettings.php:24
Sanitizer\encodeAttribute
static encodeAttribute( $text)
Encode an attribute value for HTML output.
Definition: Sanitizer.php:980
$args
if( $line===false) $args
Definition: mcc.php:124
Sanitizer\hexCharReference
static hexCharReference( $codepoint)
Definition: Sanitizer.php:1481
Sanitizer\validateCodepoint
static validateCodepoint( $codepoint)
Returns true if a given Unicode codepoint is a valid character in both HTML5 and XML.
Definition: Sanitizer.php:1496
wfUrlProtocols
wfUrlProtocols( $includeProtocolRelative=true)
Returns a regular expression of url protocols.
Definition: GlobalFunctions.php:718
Sanitizer\HTML_ENTITY_ALIASES
const HTML_ENTITY_ALIASES
Character entity aliases accepted by MediaWiki.
Definition: Sanitizer.php:341
Sanitizer\validateTagAttributes
static validateTagAttributes( $attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
Definition: Sanitizer.php:647
Sanitizer\ELEMENT_BITS_REGEX
const ELEMENT_BITS_REGEX
Acceptable tag name charset from HTML5 parsing spec https://www.w3.org/TR/html5/syntax....
Definition: Sanitizer.php:48
Sanitizer\HTML_ENTITIES
const HTML_ENTITIES
List of all named character entities defined in HTML 4.01 https://www.w3.org/TR/html4/sgml/entities....
Definition: Sanitizer.php:82
Hooks\runner
static runner()
Get a HookRunner instance for calling hooks using the new interfaces.
Definition: Hooks.php:172
Sanitizer\escapeIdForExternalInterwiki
static escapeIdForExternalInterwiki( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
Definition: Sanitizer.php:1170
Sanitizer\isReservedDataAttribute
static isReservedDataAttribute( $attr)
Given an attribute name, checks whether it is a reserved data attribute (such as data-mw-foo) which i...
Definition: Sanitizer.php:777
Sanitizer\cleanUrl
static cleanUrl( $url)
Definition: Sanitizer.php:1898
Sanitizer\setupAttributesAllowedInternal
static setupAttributesAllowedInternal()
Foreach array key (an allowed HTML element), return an array of allowed attributes.
Definition: Sanitizer.php:1617
Sanitizer\cssDecodeCallback
static cssDecodeCallback( $matches)
Definition: Sanitizer.php:918
Sanitizer\getRecognizedTagData
static getRecognizedTagData( $extratags=[], $removetags=[])
Return the various lists of recognized tags.
Definition: Sanitizer.php:401
Sanitizer\fixTagAttributes
static fixTagAttributes( $text, $element, $sorted=false)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML,...
Definition: Sanitizer.php:960
Sanitizer\escapeIdInternalUrl
static escapeIdInternalUrl( $id, $mode)
Do percent encoding of percent signs for href (but not id) attributes.
Definition: Sanitizer.php:1187
Sanitizer\normalizeCss
static normalizeCss( $value)
Normalize CSS into a format we can easily search for hostile input.
Definition: Sanitizer.php:819
Sanitizer\ID_PRIMARY
const ID_PRIMARY
Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding.
Definition: Sanitizer.php:67
Sanitizer\CHAR_REFS_REGEX
const CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferenc...
Definition: Sanitizer.php:38
Sanitizer\normalizeWhitespace
static normalizeWhitespace( $text)
Definition: Sanitizer.php:1381
Sanitizer\decodeChar
static decodeChar( $codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEM...
Definition: Sanitizer.php:1571
Sanitizer\getAttribNameRegex
static getAttribNameRegex()
Used in Sanitizer::decodeTagAttributes to filter attributes.
Definition: Sanitizer.php:386
StringUtils\delimiterReplace
static delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags='')
Perform an operation equivalent to preg_replace() with flags.
Definition: StringUtils.php:248
Sanitizer\normalizeCharReferences
static normalizeCharReferences( $text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
Definition: Sanitizer.php:1415
Sanitizer\escapeIdForLink
static escapeIdForLink( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
Definition: Sanitizer.php:1147
Sanitizer\decodeTagAttributes
static decodeTagAttributes( $text)
Return an associative array of attribute names and values from a partial tag string.
Definition: Sanitizer.php:1296
$t
$t
Definition: testCompression.php:74
Sanitizer\decodeCharReferences
static decodeCharReferences( $text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
Definition: Sanitizer.php:1515
Sanitizer
HTML sanitizer for MediaWiki.
Definition: Sanitizer.php:33
Sanitizer\checkCss
static checkCss( $value)
Pick apart some CSS and check it for forbidden or unsafe structures.
Definition: Sanitizer.php:889
Sanitizer\escapeId
static escapeId( $id, $options=[])
Given a value, escape it so that it can be used in an id attribute and return it.
Definition: Sanitizer.php:1085
Sanitizer\escapeIdReferenceList
static escapeIdReferenceList( $referenceString)
Given a string containing a space delimited list of ids, escape each id to match ids escaped by the e...
Definition: Sanitizer.php:1237
Sanitizer\getAttribsRegex
static getAttribsRegex()
Regular expression to match HTML/XML attribute pairs within a tag.
Definition: Sanitizer.php:357
Sanitizer\escapeHtmlAllowEntities
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Definition: Sanitizer.php:1279
Sanitizer\decodeCharReferencesCallback
static decodeCharReferencesCallback( $matches)
Definition: Sanitizer.php:1552
Sanitizer\cleanUrlCallback
static cleanUrlCallback( $matches)
Definition: Sanitizer.php:1952