MediaWiki  master
Sanitizer.php
Go to the documentation of this file.
1 <?php
28 
33 class Sanitizer {
// Regular expression matching, alternative by alternative: a named character
// reference (&name;), a decimal numeric reference (&#123;), a hexadecimal
// numeric reference (&#x1F; or &#X1F;), or a lone ampersand. It uses the /x
// modifier, so the line breaks and layout whitespace inside the pattern are
// not part of what is matched.
38  const CHAR_REFS_REGEX =
39  '/&([A-Za-z0-9\x80-\xff]+);
40  |&\#([0-9]+);
41  |&\#[xX]([0-9A-Fa-f]+);
42  |(&)/x';
43 
// Splits one "<"-delimited chunk of text into five capture groups, consumed by
// removeHTMLtags() as: optional leading slash, element name, the text between
// the name and the closing bracket, the closing ">" or "/>", and everything
// that follows up to the next "<".
48  const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
49 
// Case-insensitive pattern detecting a "javascript" or "vbscript" token at the
// start of a value, after whitespace, or after the end of a CSS-style comment,
// when it is followed by a non-word character or the end of the string.
58  const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
// Matches attribute names of the form "xmlns:<suffix>"; used by
// validateAttributes() to let XML namespace declarations through.
59  const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
60 
// Index into $wgFragmentMode selecting the primary escaping mode
// (see escapeIdForAttribute() and escapeIdForLink()).
66  const ID_PRIMARY = 0;
67 
// Index into $wgFragmentMode selecting the fallback escaping mode;
// escapeIdForAttribute() returns false when no such entry is configured.
74  const ID_FALLBACK = 1;
75 
// Lookup table from a named character entity (the name without the leading
// "&" and trailing ";") to its Unicode code point, given as a decimal integer.
// Names are case-sensitive: e.g. 'Aacute' (193) and 'aacute' (225) differ.
81  private const HTML_ENTITIES = [
82  'Aacute' => 193,
83  'aacute' => 225,
84  'Acirc' => 194,
85  'acirc' => 226,
86  'acute' => 180,
87  'AElig' => 198,
88  'aelig' => 230,
89  'Agrave' => 192,
90  'agrave' => 224,
91  'alefsym' => 8501,
92  'Alpha' => 913,
93  'alpha' => 945,
94  'amp' => 38,
95  'and' => 8743,
96  'ang' => 8736,
97  'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
98  'Aring' => 197,
99  'aring' => 229,
100  'asymp' => 8776,
101  'Atilde' => 195,
102  'atilde' => 227,
103  'Auml' => 196,
104  'auml' => 228,
105  'bdquo' => 8222,
106  'Beta' => 914,
107  'beta' => 946,
108  'brvbar' => 166,
109  'bull' => 8226,
110  'cap' => 8745,
111  'Ccedil' => 199,
112  'ccedil' => 231,
113  'cedil' => 184,
114  'cent' => 162,
115  'Chi' => 935,
116  'chi' => 967,
117  'circ' => 710,
118  'clubs' => 9827,
119  'cong' => 8773,
120  'copy' => 169,
121  'crarr' => 8629,
122  'cup' => 8746,
123  'curren' => 164,
124  'dagger' => 8224,
125  'Dagger' => 8225,
126  'darr' => 8595,
127  'dArr' => 8659,
128  'deg' => 176,
129  'Delta' => 916,
130  'delta' => 948,
131  'diams' => 9830,
132  'divide' => 247,
133  'Eacute' => 201,
134  'eacute' => 233,
135  'Ecirc' => 202,
136  'ecirc' => 234,
137  'Egrave' => 200,
138  'egrave' => 232,
139  'empty' => 8709,
140  'emsp' => 8195,
141  'ensp' => 8194,
142  'Epsilon' => 917,
143  'epsilon' => 949,
144  'equiv' => 8801,
145  'Eta' => 919,
146  'eta' => 951,
147  'ETH' => 208,
148  'eth' => 240,
149  'Euml' => 203,
150  'euml' => 235,
151  'euro' => 8364,
152  'exist' => 8707,
153  'fnof' => 402,
154  'forall' => 8704,
155  'frac12' => 189,
156  'frac14' => 188,
157  'frac34' => 190,
158  'frasl' => 8260,
159  'Gamma' => 915,
160  'gamma' => 947,
161  'ge' => 8805,
162  'gt' => 62,
163  'harr' => 8596,
164  'hArr' => 8660,
165  'hearts' => 9829,
166  'hellip' => 8230,
167  'Iacute' => 205,
168  'iacute' => 237,
169  'Icirc' => 206,
170  'icirc' => 238,
171  'iexcl' => 161,
172  'Igrave' => 204,
173  'igrave' => 236,
174  'image' => 8465,
175  'infin' => 8734,
176  'int' => 8747,
177  'Iota' => 921,
178  'iota' => 953,
179  'iquest' => 191,
180  'isin' => 8712,
181  'Iuml' => 207,
182  'iuml' => 239,
183  'Kappa' => 922,
184  'kappa' => 954,
185  'Lambda' => 923,
186  'lambda' => 955,
187  'lang' => 9001,
188  'laquo' => 171,
189  'larr' => 8592,
190  'lArr' => 8656,
191  'lceil' => 8968,
192  'ldquo' => 8220,
193  'le' => 8804,
194  'lfloor' => 8970,
195  'lowast' => 8727,
196  'loz' => 9674,
197  'lrm' => 8206,
198  'lsaquo' => 8249,
199  'lsquo' => 8216,
200  'lt' => 60,
201  'macr' => 175,
202  'mdash' => 8212,
203  'micro' => 181,
204  'middot' => 183,
205  'minus' => 8722,
206  'Mu' => 924,
207  'mu' => 956,
208  'nabla' => 8711,
209  'nbsp' => 160,
210  'ndash' => 8211,
211  'ne' => 8800,
212  'ni' => 8715,
213  'not' => 172,
214  'notin' => 8713,
215  'nsub' => 8836,
216  'Ntilde' => 209,
217  'ntilde' => 241,
218  'Nu' => 925,
219  'nu' => 957,
220  'Oacute' => 211,
221  'oacute' => 243,
222  'Ocirc' => 212,
223  'ocirc' => 244,
224  'OElig' => 338,
225  'oelig' => 339,
226  'Ograve' => 210,
227  'ograve' => 242,
228  'oline' => 8254,
229  'Omega' => 937,
230  'omega' => 969,
231  'Omicron' => 927,
232  'omicron' => 959,
233  'oplus' => 8853,
234  'or' => 8744,
235  'ordf' => 170,
236  'ordm' => 186,
237  'Oslash' => 216,
238  'oslash' => 248,
239  'Otilde' => 213,
240  'otilde' => 245,
241  'otimes' => 8855,
242  'Ouml' => 214,
243  'ouml' => 246,
244  'para' => 182,
245  'part' => 8706,
246  'permil' => 8240,
247  'perp' => 8869,
248  'Phi' => 934,
249  'phi' => 966,
250  'Pi' => 928,
251  'pi' => 960,
252  'piv' => 982,
253  'plusmn' => 177,
254  'pound' => 163,
255  'prime' => 8242,
256  'Prime' => 8243,
257  'prod' => 8719,
258  'prop' => 8733,
259  'Psi' => 936,
260  'psi' => 968,
261  'quot' => 34,
262  'radic' => 8730,
263  'rang' => 9002,
264  'raquo' => 187,
265  'rarr' => 8594,
266  'rArr' => 8658,
267  'rceil' => 8969,
268  'rdquo' => 8221,
269  'real' => 8476,
270  'reg' => 174,
271  'rfloor' => 8971,
272  'Rho' => 929,
273  'rho' => 961,
274  'rlm' => 8207,
275  'rsaquo' => 8250,
276  'rsquo' => 8217,
277  'sbquo' => 8218,
278  'Scaron' => 352,
279  'scaron' => 353,
280  'sdot' => 8901,
281  'sect' => 167,
282  'shy' => 173,
283  'Sigma' => 931,
284  'sigma' => 963,
285  'sigmaf' => 962,
286  'sim' => 8764,
287  'spades' => 9824,
288  'sub' => 8834,
289  'sube' => 8838,
290  'sum' => 8721,
291  'sup' => 8835,
292  'sup1' => 185,
293  'sup2' => 178,
294  'sup3' => 179,
295  'supe' => 8839,
296  'szlig' => 223,
297  'Tau' => 932,
298  'tau' => 964,
299  'there4' => 8756,
300  'Theta' => 920,
301  'theta' => 952,
302  'thetasym' => 977,
303  'thinsp' => 8201,
304  'THORN' => 222,
305  'thorn' => 254,
306  'tilde' => 732,
307  'times' => 215,
308  'trade' => 8482,
309  'Uacute' => 218,
310  'uacute' => 250,
311  'uarr' => 8593,
312  'uArr' => 8657,
313  'Ucirc' => 219,
314  'ucirc' => 251,
315  'Ugrave' => 217,
316  'ugrave' => 249,
317  'uml' => 168,
318  'upsih' => 978,
319  'Upsilon' => 933,
320  'upsilon' => 965,
321  'Uuml' => 220,
322  'uuml' => 252,
323  'weierp' => 8472,
324  'Xi' => 926,
325  'xi' => 958,
326  'Yacute' => 221,
327  'yacute' => 253,
328  'yen' => 165,
329  'Yuml' => 376,
330  'yuml' => 255,
331  'Zeta' => 918,
332  'zeta' => 950,
333  'zwj' => 8205,
334  'zwnj' => 8204
335  ];
336 
// Additional entity names mapped to an entity name; both entries (one written
// in Hebrew letters, one in Arabic letters) point at 'rlm'.
// NOTE(review): presumably the alias is resolved to its target before the
// HTML_ENTITIES lookup; the code doing so is not in this part of the file.
340  private const HTML_ENTITY_ALIASES = [
341  'רלמ' => 'rlm',
342  'رلم' => 'rlm',
343  ];
344 
/**
 * Cache for the pattern built by getAttribsRegex(); null until first use.
 */
private static $attribsRegex;

/**
 * Return the regular expression that picks attribute name/value pairs out of
 * the text of a tag. The pattern is assembled once and then served from the
 * static cache.
 *
 * Capture groups: 1 = attribute name, 2 = the "=value" part (optional),
 * 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value.
 *
 * @return string PCRE pattern including delimiters and modifiers
 */
static function getAttribsRegex() {
	if ( self::$attribsRegex !== null ) {
		return self::$attribsRegex;
	}

	// Tab, LF, FF, CR and space
	$wsChars = '\x09\x0a\x0c\x0d\x20';
	$ws = "[{$wsChars}]";
	// Any character that may appear inside an attribute name
	$nameChar = "[^{$wsChars}\/>=]";
	// The first character of a name may additionally be "="
	$nameStart = "(?:{$nameChar}|=)";

	self::$attribsRegex =
		"/({$nameStart}{$nameChar}*)
			($ws*=$ws*
			(?:
				# The attribute value: quoted or alone
				\"([^\"]*)(?:\"|\$)
				| '([^']*)(?:'|\$)
				| (((?!$ws|>).)*)
			)
		)?/sxu";

	return self::$attribsRegex;
}
375 
/**
 * Cache for the pattern built by getAttribNameRegex(); null until first use.
 */
private static $attribNameRegex;

/**
 * Return the regular expression an attribute name must match in full to be
 * accepted. The first character must be a colon, underscore, letter or
 * number; later characters may also be a dot or a hyphen. Built once and
 * cached.
 *
 * @return string PCRE pattern including delimiters and modifiers
 */
static function getAttribNameRegex() {
	if ( self::$attribNameRegex !== null ) {
		return self::$attribNameRegex;
	}

	$firstChar = "[:_\p{L}\p{N}]";
	$laterChar = "[:_\.\-\p{L}\p{N}]";
	self::$attribNameRegex = "/^({$firstChar}{$laterChar}*)$/sxu";

	return self::$attribNameRegex;
}
393 
/**
 * Return the lookup tables describing which HTML tags are recognised and how
 * they may be used.
 *
 * The base tables depend only on $wgAllowImageTag and are cached in static
 * variables. They are rebuilt only on the first call or when the value of
 * $wgAllowImageTag differs from the one they were built with. (Previously
 * the cache flag itself held the value of $wgAllowImageTag, so with a falsy
 * setting the tables were rebuilt on every call.)
 *
 * Every returned table is keyed by lower-case tag name (built with
 * array_flip), so membership is tested with isset().
 *
 * @param array $extratags Extra tag names to treat as recognised paired tags
 * @param array $removetags Tag names to drop from the recognised elements
 * @return array Map with the keys 'htmlpairs', 'htmlsingle', 'htmlsingleonly',
 *   'htmlnest', 'tabletags', 'htmllist', 'listtags', 'htmlsingleallowed' and
 *   'htmlelements'
 */
public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
	global $wgAllowImageTag;

	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic,
		$staticInitialised = false, $staticContext = null;

	// The cached tables are only valid for the configuration they were built
	// with; tests may change the global between calls.
	$globalContext = $wgAllowImageTag;
	if ( !$staticInitialised || $staticContext != $globalContext ) {
		$htmlpairsStatic = [ # Tags that must be closed
			'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
			'kbd', 'samp', 'data', 'time', 'mark'
		];
		$htmlsingle = [
			'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
		];

		# Elements that cannot have close tags. This is (not coincidentally)
		# also the list of tags for which the HTML 5 parsing algorithm
		# requires you to "acknowledge the token's self-closing flag", i.e.
		# a self-closing tag like <br/> is not an HTML 5 parse error only
		# for this list.
		$htmlsingleonly = [
			'br', 'wbr', 'hr', 'meta', 'link'
		];

		$htmlnest = [ # Tags that can be nested
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
			'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
		];
		$tabletags = [ # Can only appear inside table, we will close them
			'td', 'th', 'tr',
		];
		$htmllist = [ # Tags used by list
			'ul', 'ol',
		];
		$listtags = [ # Tags that can appear in a list
			'li',
		];

		if ( $wgAllowImageTag ) {
			$htmlsingle[] = 'img';
			$htmlsingleonly[] = 'img';
		}

		$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
		$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

		# Convert them all to hashtables for faster lookup
		$htmlpairsStatic = array_flip( $htmlpairsStatic );
		$htmlsingle = array_flip( $htmlsingle );
		$htmlsingleonly = array_flip( $htmlsingleonly );
		$htmlnest = array_flip( $htmlnest );
		$tabletags = array_flip( $tabletags );
		$htmllist = array_flip( $htmllist );
		$listtags = array_flip( $listtags );
		$htmlsingleallowed = array_flip( $htmlsingleallowed );
		$htmlelementsStatic = array_flip( $htmlelementsStatic );

		$staticContext = $globalContext;
		$staticInitialised = true;
	}

	# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
	$extratags = array_flip( $extratags );
	$removetags = array_flip( $removetags );
	$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
	$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );

	return [
		'htmlpairs' => $htmlpairs,
		'htmlsingle' => $htmlsingle,
		'htmlsingleonly' => $htmlsingleonly,
		'htmlnest' => $htmlnest,
		'tabletags' => $tabletags,
		'htmllist' => $htmllist,
		'listtags' => $listtags,
		'htmlsingleallowed' => $htmlsingleallowed,
		'htmlelements' => $htmlelements,
	];
}
481 
/**
 * Escape every HTML tag in $text that is not recognised or not valid, and
 * normalise the recognised ones.
 *
 * HTML comments are removed first. The text is then split on "<" and each
 * piece is matched against ELEMENT_BITS_REGEX. A piece that is a recognised
 * tag and passes validateTag() is re-emitted with its attributes filtered by
 * fixTagAttributes(); any other piece is emitted with its "<" turned into
 * "&lt;" and every ">" turned into "&gt;".
 *
 * Two code paths exist. When MWTidy is not enabled (a deprecated
 * configuration, reported via wfDeprecated) a tag stack is kept so that
 * mismatched close tags are rejected and tags still open at the end are
 * closed. When MWTidy is enabled no nesting is tracked, and a self-closing
 * tag that is not in the "single only" list is rewritten as an empty
 * open/close pair.
 *
 * @param string $text Text to sanitise
 * @param callable|null $processCallback If callable, invoked as
 *   ( &$params, $args ) so it can rewrite a tag's attribute text in place
 * @param array|bool $args Passed through to $processCallback
 * @param array $extratags Extra tag names to allow (see getRecognizedTagData)
 * @param array $removetags Tag names to disallow (see getRecognizedTagData)
 * @param callable|null $warnCallback If callable, invoked with
 *   'deprecated-self-close-category' when a self-closed tag is found that
 *   should not be self-closed
 * @return string The sanitised text
 */
497  public static function removeHTMLtags( $text, $processCallback = null,
498  $args = [], $extratags = [], $removetags = [], $warnCallback = null
499  ) {
500  $tagData = self::getRecognizedTagData( $extratags, $removetags );
501  $htmlpairs = $tagData['htmlpairs'];
502  $htmlsingle = $tagData['htmlsingle'];
503  $htmlsingleonly = $tagData['htmlsingleonly'];
504  $htmlnest = $tagData['htmlnest'];
505  $tabletags = $tagData['tabletags'];
506  $htmllist = $tagData['htmllist'];
507  $listtags = $tagData['listtags'];
508  $htmlsingleallowed = $tagData['htmlsingleallowed'];
509  $htmlelements = $tagData['htmlelements'];
510 
511  # Remove HTML comments
512  $text = self::removeHTMLcomments( $text );
// Everything before the first "<" is plain text; only ">" needs escaping there.
513  $bits = explode( '<', $text );
514  $text = str_replace( '>', '&gt;', array_shift( $bits ) );
515  if ( !MWTidy::isEnabled() ) {
516  wfDeprecated( 'disabling tidy', '1.33' );
// $tagstack holds the currently open tags; $tablestack saves the outer
// $tagstack each time a <table> is opened.
517  $tagstack = $tablestack = [];
518  foreach ( $bits as $x ) {
519  $regs = [];
520  # $slash: Does the current element start with a '/'?
521  # $t: Current element name
522  # $params: String between element name and >
523  # $brace: Ending '>' or '/>'
524  # $rest: Everything until the next element of $bits
525  if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
526  list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
527  } else {
528  $slash = $t = $params = $brace = $rest = null;
529  }
530 
531  $badtag = false;
532  $t = strtolower( $t );
533  if ( isset( $htmlelements[$t] ) ) {
534  # Check our stack
535  if ( $slash && isset( $htmlsingleonly[$t] ) ) {
536  $badtag = true;
537  } elseif ( $slash ) {
538  # Closing a tag... is it the one we just opened?
539  Wikimedia\suppressWarnings();
540  $ot = array_pop( $tagstack );
541  Wikimedia\restoreWarnings();
542 
543  if ( $ot != $t ) {
544  if ( isset( $htmlsingleallowed[$ot] ) ) {
545  # Pop all elements with an optional close tag
546  # and see if we find a match below them
547  $optstack = [];
548  array_push( $optstack, $ot );
549  Wikimedia\suppressWarnings();
550  $ot = array_pop( $tagstack );
551  Wikimedia\restoreWarnings();
552  while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
553  array_push( $optstack, $ot );
554  Wikimedia\suppressWarnings();
555  $ot = array_pop( $tagstack );
556  Wikimedia\restoreWarnings();
557  }
558  if ( $t != $ot ) {
559  # No match. Push the optional elements back again
560  $badtag = true;
561  Wikimedia\suppressWarnings();
562  $ot = array_pop( $optstack );
563  Wikimedia\restoreWarnings();
564  while ( $ot ) {
565  array_push( $tagstack, $ot );
566  Wikimedia\suppressWarnings();
567  $ot = array_pop( $optstack );
568  Wikimedia\restoreWarnings();
569  }
570  }
571  } else {
572  Wikimedia\suppressWarnings();
573  array_push( $tagstack, $ot );
574  Wikimedia\restoreWarnings();
575 
576  # <li> can be nested in <ul> or <ol>, skip those cases:
577  if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
578  $badtag = true;
579  }
580  }
581  } elseif ( $t == 'table' ) {
// Closing the table: restore the tag stack saved when it was opened.
582  $tagstack = array_pop( $tablestack );
583  }
584  $newparams = '';
585  } else {
586  # Keep track for later
587  if ( isset( $tabletags[$t] ) && !in_array( 'table', $tagstack ) ) {
588  $badtag = true;
589  } elseif ( in_array( $t, $tagstack ) && !isset( $htmlnest[$t] ) ) {
590  $badtag = true;
591  # Is it a self-closed htmlpair? (T7487)
592  } elseif ( $brace == '/>' && isset( $htmlpairs[$t] ) ) {
593  // Eventually we'll just remove the self-closing
594  // slash, in order to be consistent with HTML5
595  // semantics.
596  // $brace = '>';
597  // For now, let's just warn authors to clean up.
598  if ( is_callable( $warnCallback ) ) {
599  call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] );
600  }
601  $badtag = true;
602  } elseif ( isset( $htmlsingleonly[$t] ) ) {
603  # Hack to force empty tag for unclosable elements
604  $brace = '/>';
605  } elseif ( isset( $htmlsingle[$t] ) ) {
606  # Hack to not close $htmlsingle tags
607  $brace = null;
608  # Still need to push this optionally-closed tag to
609  # the tag stack so that we can match end tags
610  # instead of marking them as bad.
611  array_push( $tagstack, $t );
612  } elseif ( isset( $tabletags[$t] ) && in_array( $t, $tagstack ) ) {
613  // New table tag but forgot to close the previous one
614  $text .= "</$t>";
615  } else {
616  if ( $t == 'table' ) {
// Start a fresh tag stack for the table's contents.
617  array_push( $tablestack, $tagstack );
618  $tagstack = [];
619  }
620  array_push( $tagstack, $t );
621  }
622 
623  # Replace any variables or template parameters with
624  # plaintext results.
625  if ( is_callable( $processCallback ) ) {
626  call_user_func_array( $processCallback, [ &$params, $args ] );
627  }
628 
629  if ( !self::validateTag( $params, $t ) ) {
630  $badtag = true;
631  }
632 
633  # Strip non-approved attributes from the tag
634  $newparams = self::fixTagAttributes( $params, $t );
635  }
636  if ( !$badtag ) {
637  $rest = str_replace( '>', '&gt;', $rest );
638  $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
639  $text .= "<$slash$t$newparams$close>$rest";
640  continue;
641  }
642  }
// Not a recognised, valid tag: emit the piece as escaped text.
643  $text .= '&lt;' . str_replace( '>', '&gt;', $x );
644  }
645  # Close off any remaining tags
646  while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
647  $text .= "</$t>\n";
648  if ( $t == 'table' ) {
649  $tagstack = array_pop( $tablestack );
650  }
651  }
652  } else {
653  # this might be possible using tidy itself
654  foreach ( $bits as $x ) {
655  if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
656  list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
657 
658  $badtag = false;
659  $t = strtolower( $t );
660  if ( isset( $htmlelements[$t] ) ) {
661  if ( is_callable( $processCallback ) ) {
662  call_user_func_array( $processCallback, [ &$params, $args ] );
663  }
664 
665  if ( $brace == '/>' && !( isset( $htmlsingle[$t] ) || isset( $htmlsingleonly[$t] ) ) ) {
666  // Eventually we'll just remove the self-closing
667  // slash, in order to be consistent with HTML5
668  // semantics.
669  // $brace = '>';
670  // For now, let's just warn authors to clean up.
671  if ( is_callable( $warnCallback ) ) {
672  call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] );
673  }
674  }
675  if ( !self::validateTag( $params, $t ) ) {
676  $badtag = true;
677  }
678 
679  $newparams = self::fixTagAttributes( $params, $t );
680  if ( !$badtag ) {
681  if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) {
682  # Interpret self-closing tags as empty tags even when
683  # HTML 5 would interpret them as start tags. Such input
684  # is commonly seen on Wikimedia wikis with this intention.
685  $brace = "></$t>";
686  }
687 
688  $rest = str_replace( '>', '&gt;', $rest );
689  $text .= "<$slash$t$newparams$brace$rest";
690  continue;
691  }
692  }
693  }
// Not a recognised, valid tag: emit the piece as escaped text.
694  $text .= '&lt;' . str_replace( '>', '&gt;', $x );
695  }
696  }
697  return $text;
698  }
699 
/**
 * Strip every terminated HTML comment from the text.
 *
 * When a comment sits on a line of its own (only spaces around it, a newline
 * directly before and after), the surrounding spaces and one of the two
 * newlines are removed with it. Otherwise only the comment is removed.
 * Processing stops at the first comment that has no closing "-->".
 *
 * @param string $text
 * @return string
 */
public static function removeHTMLcomments( $text ) {
	for ( ; ; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; leave the remainder alone
			break;
		}
		# Point just past the "-->"
		$close += 3;

		# Widen the span over any spaces on either side of the comment
		$from = max( $open - 1, 0 );
		$length = $close - $from;
		while ( $from > 0 && substr( $text, $from, 1 ) === ' ' ) {
			$from--;
			$length++;
		}
		while ( substr( $text, $from + $length, 1 ) === ' ' ) {
			$length++;
		}

		$onOwnLine = substr( $text, $from, 1 ) === "\n"
			&& substr( $text, $from + $length, 1 ) === "\n";

		# Either collapse the whole line into a single newline, or cut out
		# just the comment
		$text = $onOwnLine
			? substr_replace( $text, "\n", $from, $length + 1 )
			: substr_replace( $text, '', $open, $close - $open );
	}

	return $text;
}
742 
/**
 * Check that a tag carries the attributes it needs to be acceptable.
 *
 * Only <meta> and <link> have requirements: both need an itemprop
 * attribute, <meta> additionally needs content, and <link> additionally
 * needs href. Every other element is accepted.
 *
 * @param string $params Raw attribute text of the tag
 * @param string $element Tag name
 * @return bool False if a required attribute is missing
 */
static function validateTag( $params, $element ) {
	$attribs = self::decodeTagAttributes( $params );

	$isMeta = ( $element == 'meta' );
	$isLink = ( $element == 'link' );
	if ( !$isMeta && !$isLink ) {
		return true;
	}

	// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
	if ( !isset( $attribs['itemprop'] ) ) {
		return false;
	}
	// <meta> must have a content="" for the itemprop
	if ( $isMeta && !isset( $attribs['content'] ) ) {
		return false;
	}
	// <link> must have an associated href=""
	if ( $isLink && !isset( $attribs['href'] ) ) {
		return false;
	}

	return true;
}
775 
/**
 * Filter an attribute map for the given element, using the whitelist that
 * attributeWhitelistInternal() provides for that element.
 *
 * @param array $attribs Attribute name => value
 * @param string $element Tag name
 * @return array Filtered attribute map (see validateAttributes)
 */
static function validateTagAttributes( $attribs, $element ) {
	$whitelist = self::attributeWhitelistInternal( $element );

	return self::validateAttributes( $attribs, $whitelist );
}
795 
/**
 * Filter an attribute map against a whitelist and sanitise the values of the
 * attributes that survive.
 *
 * - xmlns:* declarations are kept unless their value matches EVIL_URI_PATTERN.
 * - Other attributes must be whitelisted or be a colon-free data-* attribute,
 *   and must not be a reserved data attribute.
 * - style is passed through checkCss(), id through escapeIdForAttribute(),
 *   and the ARIA id-list attributes through escapeIdReferenceList().
 * - RDFa / microdata attributes are dropped if they match EVIL_URI_PATTERN.
 * - href, src and poster are dropped unless they start with a protocol from
 *   wfUrlProtocols() (which also drops relative URLs).
 * - itemtype, itemid and itemref are dropped when itemscope is absent.
 *
 * @param array $attribs Attribute name => value
 * @param array $whitelist Allowed names, either as keys or as a plain list
 * @return array Filtered attribute name => sanitised value
 */
static function validateAttributes( $attribs, $whitelist ) {
	if ( isset( $whitelist[0] ) ) {
		// We would like to eventually deprecate calling this
		// function with a sequential array, but for now just
		// convert it.
		$whitelist = array_flip( $whitelist );
	}
	$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	# Attributes whose value is a space-separated list of ids
	$idListAttribs = [ 'aria-describedby', 'aria-flowto', 'aria-labelledby', 'aria-owns' ];
	# RDFa and HTML5 microdata attributes; these allow URLs, URIs and/or CURIs
	$semanticAttribs = [
		'rel', 'rev',
		'about', 'property', 'resource', 'datatype', 'typeof',
		'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',
	];
	# Attributes holding a URL
	$urlAttribs = [ 'href', 'src', 'poster' ];

	$out = [];
	foreach ( $attribs as $attribute => $value ) {
		# Allow XML namespace declaration to allow RDFa
		if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				$out[$attribute] = $value;
			}
			continue;
		}

		# Allow any attribute beginning with "data-", provided it is not
		# namespaced (no colons) and not reserved for MediaWiki code;
		# anything else has to be whitelisted.
		$permitted = (
			preg_match( '/^data-[^:]*$/i', $attribute )
			|| array_key_exists( $attribute, $whitelist )
		) && !self::isReservedDataAttribute( $attribute );
		if ( !$permitted ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# https://msdn.microsoft.com/en-us/library/ms537634.aspx
		if ( $attribute == 'style' ) {
			$value = self::checkCss( $value );
		}

		# Escape HTML id attributes
		if ( $attribute === 'id' ) {
			$value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
		}

		# Escape HTML id reference lists
		if ( in_array( $attribute, $idListAttribs, true ) ) {
			$value = self::escapeIdReferenceList( $value );
		}

		// Paranoia. Allow "simple" values but suppress javascript
		if ( in_array( $attribute, $semanticAttribs, true )
			&& preg_match( self::EVIL_URI_PATTERN, $value )
		) {
			continue;
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		# validation code that can be used by tag hook handlers, etc
		if ( in_array( $attribute, $urlAttribs, true )
			&& !preg_match( $hrefExp, $value )
		) {
			// Drop any href or src attributes not using an allowed protocol.
			// NOTE: this also drops all relative URLs
			continue;
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$out[$attribute] = $value;
	}

	# itemtype, itemid, itemref don't make sense without itemscope
	if ( !array_key_exists( 'itemscope', $out ) ) {
		unset( $out['itemtype'], $out['itemid'], $out['itemref'] );
	}
	# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

	return $out;
}
907 
/**
 * Tell whether an attribute name starts with one of the reserved data
 * prefixes: "data-ooui", "data-mw" or "data-parsoid" (case-insensitive).
 *
 * data-ooui is reserved for ooui; data-mw and data-parsoid are reserved for
 * parsoid; data-mw-<name> is reserved for extensions (or core) that need to
 * pass data to the client and be sure it does not come from an untrusted
 * user. Namespaces are ignored since user-generated HTML can't use them
 * anymore.
 *
 * @param string $attr Attribute name
 * @return bool
 */
public static function isReservedDataAttribute( $attr ) {
	$found = preg_match( '/^data-(ooui|mw|parsoid)/i', $attr );

	return $found === 1;
}
925 
/**
 * Merge two attribute maps. Values from $b win over values from $a, except
 * for 'class': when both maps carry a different string class value, the two
 * lists of class names are combined and de-duplicated.
 *
 * @param array $a
 * @param array $b
 * @return array
 */
static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	$bothHaveClass = isset( $a['class'], $b['class'] )
		&& is_string( $a['class'] )
		&& is_string( $b['class'] );

	if ( $bothHaveClass && $a['class'] !== $b['class'] ) {
		$names = preg_split(
			'/\s+/',
			"{$a['class']} {$b['class']}",
			-1,
			PREG_SPLIT_NO_EMPTY
		);
		$merged['class'] = implode( ' ', array_unique( $names ) );
	}

	return $merged;
}
948 
/**
 * Normalise a CSS value so that later checks see its plain form: decode
 * character references, decode CSS escape sequences and line continuations,
 * and remove comments.
 *
 * A value consisting of nothing but a single comment is returned as is after
 * decoding, so that another function's error marker can pass through.
 *
 * @param string $value
 * @return string
 */
public static function normalizeCss( $value ) {
	// Decode character references like &#123;
	$value = self::decodeCharReferences( $value );

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references.
	// This means it isn't possible for this function to return
	// unsanitized escape sequences. It is possible to manufacture
	// input that contains character references that decode to
	// escape sequences that decode to character references, but
	// it's OK for the return value to contain character references
	// because the caller is supposed to escape those anyway.
	static $decodeRegex;
	if ( !$decodeRegex ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$decodeRegex = "/ $backslash
			(?:
				($nl) |  # 1. Line continuation
				([0-9A-Fa-f]{1,6})$space? |  # 2. character number
				(.) | # 3. backslash cancelling special meaning
				() | # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback(
		$decodeRegex,
		[ __CLASS__, 'cssDecodeCallback' ],
		$value
	);

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if ( preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
		return $value;
	}

	// Remove any comments; IE gets token splitting wrong
	// This must be done AFTER decoding character references and
	// escape sequences, because those steps can introduce comments
	// This step cannot introduce character references or escape
	// sequences, because it replaces comments with spaces rather
	// than removing them completely.
	$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

	// Remove anything after a comment-start token, to guard against
	// incorrect client implementations.
	$danglingComment = strpos( $value, '/*' );
	if ( $danglingComment === false ) {
		return $value;
	}

	return substr( $value, 0, $danglingComment );
}
1008 
/**
 * Normalise a CSS value and reject it if it looks unsafe.
 *
 * The value is first run through normalizeCss(). It is then replaced by a
 * CSS comment naming the problem when it contains a control character or
 * the UTF-8 replacement character, or when it contains one of a list of
 * problematic keywords and functions (expression, filter:, url(, var(, ...).
 *
 * @param string $value
 * @return string The normalised value, or a comment describing the rejection
 */
static function checkCss( $value ) {
	$value = self::normalizeCss( $value );

	// Reject control characters
	$hasControlChar = preg_match( '/[\000-\010\013\016-\037\177]/', $value )
		|| strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false;
	if ( $hasControlChar ) {
		return '/* invalid control char */';
	}

	// Reject problematic keywords
	$insecurePattern = '! expression
		| filter\s*:
		| accelerator\s*:
		| -o-link\s*:
		| -o-link-source\s*:
		| -o-replace\s*:
		| url\s*\(
		| image\s*\(
		| image-set\s*\(
		| attr\s*\([^)]+[\s,]+url
		| var\s*\(
		!ix';
	if ( preg_match( $insecurePattern, $value ) ) {
		return '/* insecure input */';
	}

	return $value;
}
1051 
/**
 * Callback for the escape-sequence regex in normalizeCss(): turn one matched
 * CSS escape into its replacement text.
 *
 * @param array $matches Match data; index 1 = line continuation,
 *   2 = hexadecimal character number, 3 = single escaped character
 * @return string
 */
static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		// Line continuation: drops out entirely
		return '';
	}

	if ( $matches[2] !== '' ) {
		$decoded = UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$decoded = $matches[3];
	} else {
		// Backslash at end of string
		$decoded = '\\';
	}

	// Newline, quotes and backslash need to stay escaped inside strings;
	// emit a clean escape sequence to avoid parsing errors by clients.
	if ( in_array( $decoded, [ "\n", '"', "'", '\\' ] ) ) {
		return '\\' . dechex( ord( $decoded ) ) . ' ';
	}

	// Anything else was an unnecessary escape
	return $decoded;
}
1076 
/**
 * Turn the raw attribute text of a tag into a safe, re-encoded attribute
 * string: decode it, filter it against the element's whitelist, optionally
 * sort by attribute name, and encode the result.
 *
 * @param string $text Raw attribute text
 * @param string $element Tag name
 * @param bool $sorted Whether to sort the attributes by name
 * @return string Empty string when $text is blank
 */
static function fixTagAttributes( $text, $element, $sorted = false ) {
	if ( trim( $text ) == '' ) {
		return '';
	}

	$attribs = self::validateTagAttributes(
		self::decodeTagAttributes( $text ),
		$element
	);
	if ( $sorted ) {
		ksort( $attribs );
	}

	return self::safeEncodeTagAttributes( $attribs );
}
1112 
/**
 * Encode a string for use as an HTML attribute value: HTML special
 * characters (including both quote types) become entities, and newline,
 * carriage return and tab become numeric character references.
 *
 * @param string $text
 * @return string
 */
static function encodeAttribute( $text ) {
	// Whitespace is normalized during attribute decoding,
	// so if we've been passed non-spaces we must encode them
	// ahead of time or they won't be preserved.
	$whitespaceRefs = [
		"\n" => '&#10;',
		"\r" => '&#13;',
		"\t" => '&#9;',
	];

	return strtr( htmlspecialchars( $text, ENT_QUOTES ), $whitespaceRefs );
}
1132 
/**
 * Replace the plain spaces used by French punctuation rules with $space:
 * a space before ? : ; ! % » › (when something precedes the space and no
 * word character follows the punctuation), and a space after « or ‹.
 *
 * @param string $text
 * @param string $space Replacement for each such space
 * @return string
 */
public static function armorFrenchSpaces( $text, $space = '&#160;' ) {
	// Replace $ with \$ and \ with \\ so that $space is taken literally
	// when used as a regex replacement
	$space = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );

	# French spaces, last one Guillemet-left
	# only if there is something before the space
	# and a non-word character after the punctuation.
	$patterns = [ '/(?<=\S) (?=[?:;!%»›](?!\w))/u' ];
	$replacements = [ "$space" ];

	# French spaces, Guillemet-right
	$patterns[] = '/([«‹]) /u';
	$replacements[] = "\\1$space";

	return preg_replace( $patterns, $replacements, $text );
}
1154 
/**
 * Encode an attribute value like encodeAttribute(), and additionally escape
 * wiki markup, magic link keywords, French spaces and URL protocols, so that
 * the value is not expanded by later parsing stages.
 *
 * @param string $text
 * @return string
 */
static function safeEncodeAttribute( $text ) {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$markupRefs = [
		'<' => '&lt;', // This should never happen,
		'>' => '&gt;', // we've received invalid input
		'"' => '&quot;', // which should have been escaped.
		'{' => '&#123;',
		'}' => '&#125;', // prevent unpaired language conversion syntax
		'[' => '&#91;',
		']' => '&#93;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	];
	$encoded = strtr( self::encodeAttribute( $text ), $markupRefs );

	# Armor against French spaces detection (T5158)
	$encoded = self::armorFrenchSpaces( $encoded, '&#32;' );

	# Break up URL protocols by encoding their colon
	$protocolPattern = '/((?i)' . wfUrlProtocols() . ')/';
	$encodeColon = function ( $matches ) {
		return str_replace( ':', '&#58;', $matches[1] );
	};

	return preg_replace_callback( $protocolPattern, $encodeColon, $encoded );
}
1194 
/**
 * Escape a string as an HTML4-style id: spaces become underscores, the
 * result is URL-encoded, "%3A" is turned back into ":" and every remaining
 * "%" becomes ".". Unless the 'noninitial' option is given, an "x" is
 * prefixed when the result does not start with an ASCII letter.
 *
 * @deprecated since 1.30 (reported through wfDeprecated)
 * @param string $id
 * @param string|array $options String or array of strings; 'noninitial'
 *   suppresses the leading-letter fix
 * @return string
 */
static function escapeId( $id, $options = [] ) {
	wfDeprecated( __METHOD__, '1.30' );
	$options = (array)$options;

	// HTML4-style escaping
	static $replace = [
		'%3A' => ':',
		'%' => '.'
	];

	$escaped = strtr( urlencode( strtr( $id, ' ', '_' ) ), $replace );

	$mayStartAnId = in_array( 'noninitial', $options )
		|| preg_match( '/^[a-zA-Z]/', $escaped );
	if ( !$mayStartAnId ) {
		// Initial character must be a letter!
		$escaped = "x$escaped";
	}

	return $escaped;
}
1242 
/**
 * Escape an id for use in an id attribute, using the escaping mode that
 * $wgFragmentMode configures for the requested slot.
 *
 * @param string $id
 * @param int $mode One of the ID_* constants, an index into $wgFragmentMode
 * @return string|bool The escaped id, or false when the requested
 *   non-primary mode is not configured
 * @throws UnexpectedValueException When no primary mode is configured
 */
public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
	global $wgFragmentMode;

	if ( isset( $wgFragmentMode[$mode] ) ) {
		return self::escapeIdInternal( $id, $wgFragmentMode[$mode] );
	}

	if ( $mode === self::ID_PRIMARY ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return false;
}
1272 
/**
 * Escape an id for use as a link fragment, using the primary mode from
 * $wgFragmentMode.
 *
 * @param string $id
 * @return string
 * @throws UnexpectedValueException When no primary mode is configured
 */
public static function escapeIdForLink( $id ) {
	global $wgFragmentMode;

	$primaryIsSet = isset( $wgFragmentMode[self::ID_PRIMARY] );
	if ( !$primaryIsSet ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return self::escapeIdInternal( $id, $wgFragmentMode[self::ID_PRIMARY] );
}
1298 
/**
 * Escape an id for use as the fragment of a link to an external interwiki,
 * using the mode configured in $wgExternalInterwikiFragmentMode.
 *
 * @param string $id
 * @return string
 * @throws InvalidArgumentException From escapeIdInternal() when the
 *   configured mode is not one it supports
 */
public static function escapeIdForExternalInterwiki( $id ) {
	// Without this declaration the variable below is an undefined local and
	// escapeIdInternal() rejects it as an invalid mode.
	global $wgExternalInterwikiFragmentMode;

	$id = self::escapeIdInternal( $id, $wgExternalInterwikiFragmentMode );

	return $id;
}
1315 
/**
 * Escape an id according to the given mode.
 *
 * - 'html5': spaces are replaced with underscores; nothing else changes.
 * - 'legacy': spaces are replaced with underscores, the result is
 *   URL-encoded, "%3A" is turned back into ":" and every remaining "%"
 *   becomes ".".
 *
 * @param string $id
 * @param string $mode 'html5' or 'legacy'
 * @return string
 * @throws InvalidArgumentException For any other mode
 */
private static function escapeIdInternal( $id, $mode ) {
	switch ( $mode ) {
		case 'html5':
			$id = str_replace( ' ', '_', $id );
			break;
		case 'legacy':
			// This corresponds to 'noninitial' mode of the old escapeId()
			static $replace = [
				'%3A' => ':',
				'%' => '.'
			];

			$id = urlencode( str_replace( ' ', '_', $id ) );
			$id = strtr( $id, $replace );
			break;
		default:
			// The method name is quoted on both sides; the closing quote
			// used to be missing from the message.
			throw new InvalidArgumentException(
				"Invalid mode '$mode' passed to '" . __METHOD__ . "'"
			);
	}

	return $id;
}
1344 
/**
 * Escape a whitespace-separated list of id references: each token is
 * escaped with escapeIdForAttribute() and the tokens are joined again with
 * single spaces.
 *
 * @param string $referenceString
 * @return string Empty string when the list contains no tokens
 */
public static function escapeIdReferenceList( $referenceString ) {
	# Break the list into its non-empty tokens
	$tokens = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );

	# Escape each token as an id
	$escaped = array_map(
		function ( $token ) {
			return self::escapeIdForAttribute( $token );
		},
		$tokens
	);

	# Join back into a space delimited list; an empty array gives ''
	return implode( ' ', $escaped );
}
1369 
/**
 * Turn a string into something usable as a class name.
 *
 * A leading digit or hyphen, control characters, spaces, the listed
 * punctuation and the byte pair C2 A0 each become an underscore. Runs of
 * underscores are then collapsed and trailing underscores removed.
 *
 * @param string $class
 * @return string
 */
static function escapeClass( $class ) {
	// Step 1: replace the unwanted characters
	$cleaned = preg_replace(
		'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
		'_',
		$class
	);
	// Step 2: squeeze consecutive underscores into one
	$cleaned = preg_replace( '/_+/', '_', $cleaned );

	return rtrim( $cleaned, '_' );
}
1388 
/**
 * Decode the character references in a string, then HTML-escape the result.
 *
 * @param string $html
 * @return string
 */
static function escapeHtmlAllowEntities( $html ) {
	$decoded = self::decodeCharReferences( $html );

	// ENT_QUOTES escapes ' as well as ". ENT_SUBSTITUTE keeps incorrectly
	// truncated multibyte characters from blanking the whole string.
	return htmlspecialchars( $decoded, ENT_QUOTES | ENT_SUBSTITUTE );
}
1404 
/**
 * Parse the attribute portion of a tag into a name => value array.
 *
 * Names are lowercased and dropped when they fail the attribute-name regex.
 * Values have whitespace runs collapsed, are trimmed, and have character
 * references decoded. A later duplicate of a name overwrites an earlier one.
 *
 * @param string $text
 * @return array Attribute name => decoded value; empty when nothing matches
 */
public static function decodeTagAttributes( $text ) {
	if ( trim( $text ) == '' ) {
		return [];
	}

	$matchSets = [];
	$found = preg_match_all( self::getAttribsRegex(), $text, $matchSets, PREG_SET_ORDER );
	if ( !$found ) {
		return [];
	}

	$decoded = [];
	foreach ( $matchSets as $matchSet ) {
		$name = strtolower( $matchSet[1] );

		// Only keep attributes whose name passes the name regex
		if ( preg_match( self::getAttribNameRegex(), $name ) ) {
			$raw = self::getTagAttributeCallback( $matchSet );

			// Collapse whitespace runs to one space and trim the ends
			$raw = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

			// Store the value with character references decoded
			$decoded[$name] = self::decodeCharReferences( $raw );
		}
	}

	return $decoded;
}
1447 
/**
 * Build an attribute string from a name => value array.
 *
 * @param array $assoc_array Attribute name => value
 * @return string The name="value" pairs joined by spaces with a leading space,
 *   or '' when the array is empty
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
	$pairs = [];
	foreach ( $assoc_array as $attribute => $value ) {
		$name = htmlspecialchars( $attribute );
		$pairs[] = $name . '="' . self::safeEncodeAttribute( $value ) . '"';
	}

	if ( !count( $pairs ) ) {
		return '';
	}

	return ' ' . implode( ' ', $pairs );
}
1465 
/**
 * Pick the attribute value out of one attribute regex match set.
 *
 * Groups are checked from the highest index down: 5 (no quotes),
 * 4 (single-quoted), 3 (double-quoted). When none of those is set and
 * group 2 is absent too, the attribute had no value and '' is returned.
 *
 * @param array $set One match set as produced with PREG_SET_ORDER
 * @return string
 * @throws MWException When group 2 is set but none of groups 3-5 are
 */
private static function getTagAttributeCallback( $set ) {
	foreach ( [ 5, 4, 3 ] as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( !isset( $set[2] ) ) {
		# In XHTML, attributes must have a value so return an empty string.
		# See "Empty attribute syntax",
		# https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
		return "";
	}

	throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
}
1493 
/**
 * Collapse every run of spaces, tabs, CRs and LFs to one space and trim the result.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
	$collapsed = preg_replace( '/(?:\r\n|[\x20\x0d\x0a\x09])+/', ' ', $text );

	return trim( $collapsed );
}
1504 
/**
 * Collapse every run of spaces and underscores in a section name to a
 * single space and trim the result.
 *
 * @param string $section
 * @return string
 */
static function normalizeSectionNameWhitespace( $section ) {
	$spaced = preg_replace( '/[ _]+/', ' ', $section );

	return trim( $spaced );
}
1516 
/**
 * Run normalizeCharReferencesCallback() over every match of
 * CHAR_REFS_REGEX in the text.
 *
 * @param string $text
 * @return string
 */
static function normalizeCharReferences( $text ) {
	$callback = [ self::class, 'normalizeCharReferencesCallback' ];

	return preg_replace_callback( self::CHAR_REFS_REGEX, $callback, $text );
}
1538 
/**
 * Callback for normalizeCharReferences(): normalize one CHAR_REFS_REGEX match.
 *
 * @param array $matches Match array: group 1 is a named entity, group 2 a
 *   decimal reference, group 3 a hexadecimal reference
 * @return string The normalized reference, or the HTML-escaped original match
 *   when no group applied or the numeric reference was rejected
 */
static function normalizeCharReferencesCallback( $matches ) {
	// The groups are tested in order because trailing groups that did not
	// take part in the match may be absent from the array.
	if ( $matches[1] != '' ) {
		$normalized = self::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		$normalized = self::decCharReference( $matches[2] );
	} elseif ( $matches[3] != '' ) {
		$normalized = self::hexCharReference( $matches[3] );
	} else {
		$normalized = null;
	}

	return $normalized ?? htmlspecialchars( $matches[0] );
}
1558 
/**
 * Normalize a named entity.
 *
 * Names listed in HTML_ENTITY_ALIASES are returned as the aliased named
 * entity. lt, gt, amp and quot stay as named entities. Any other name found
 * in HTML_ENTITIES becomes a decimal numeric reference. Unknown names are
 * returned with the ampersand escaped.
 *
 * @param string $name Entity name without the surrounding '&' and ';'
 * @return string
 */
static function normalizeEntity( $name ) {
	if ( isset( self::HTML_ENTITY_ALIASES[$name] ) ) {
		return '&' . self::HTML_ENTITY_ALIASES[$name] . ';';
	}

	$keptAsNamed = [ 'lt', 'gt', 'amp', 'quot' ];
	if ( in_array( $name, $keptAsNamed ) ) {
		return "&$name;";
	}

	if ( isset( self::HTML_ENTITIES[$name] ) ) {
		return '&#' . self::HTML_ENTITIES[$name] . ';';
	}

	return "&amp;$name;";
}
1580 
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint Decimal digits of the reference
 * @return string|null The reference rewritten as '&#N;', or null when
 *   validateCodepoint() rejects the code point
 */
static function decCharReference( $codepoint ) {
	$value = intval( $codepoint );

	return self::validateCodepoint( $value )
		? sprintf( '&#%d;', $value )
		: null;
}
1593 
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint Hexadecimal digits of the reference
 * @return string|null The reference rewritten as '&#xN;' with lowercase hex
 *   digits, or null when validateCodepoint() rejects the code point
 */
static function hexCharReference( $codepoint ) {
	$value = hexdec( $codepoint );

	return self::validateCodepoint( $value )
		? sprintf( '&#x%x;', $value )
		: null;
}
1606 
/**
 * Report whether a numeric code point is accepted by the sanitizer.
 *
 * Accepted: U+0009, U+000A, U+0020-U+007E, U+00A0-U+D7FF, U+E000-U+FFFD
 * and U+10000-U+10FFFF.
 *
 * @param int|float $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	# U+007F - U+009F are disallowed in HTML5 (control characters).
	if ( $codepoint == 0x09 || $codepoint == 0x0a ) {
		return true;
	}

	$acceptedRanges = [
		[ 0x20, 0x7e ],
		[ 0xa0, 0xd7ff ],
		[ 0xe000, 0xfffd ],
		[ 0x10000, 0x10ffff ],
	];
	foreach ( $acceptedRanges as $range ) {
		if ( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
			return true;
		}
	}

	return false;
}
1624 
/**
 * Run decodeCharReferencesCallback() over every match of CHAR_REFS_REGEX
 * in the text.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
	$callback = [ self::class, 'decodeCharReferencesCallback' ];

	return preg_replace_callback( self::CHAR_REFS_REGEX, $callback, $text );
}
1638 
/**
 * Decode character references like decodeCharReferences() does, then pass
 * the text through the content language's normalize() if at least one
 * replacement was made.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
	$replacements = 0;
	$decoded = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		[ self::class, 'decodeCharReferencesCallback' ],
		$text,
		-1, // no limit
		$replacements
	);

	// Nothing was replaced, so the text is returned without normalizing
	if ( !$replacements ) {
		return $decoded;
	}

	return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $decoded );
}
1664 
/**
 * Callback for decodeCharReferences(): decode one CHAR_REFS_REGEX match.
 *
 * @param array $matches Match array: group 1 is a named entity, group 2 a
 *   decimal reference, group 3 a hexadecimal reference
 * @return string
 */
static function decodeCharReferencesCallback( $matches ) {
	// Groups are tested in order; a later group may be absent from the array
	// when an earlier one matched.
	if ( $matches[1] != '' ) {
		return self::decodeEntity( $matches[1] );
	}
	if ( $matches[2] != '' ) {
		$decimal = intval( $matches[2] );
		return self::decodeChar( $decimal );
	}
	if ( $matches[3] != '' ) {
		$hexadecimal = hexdec( $matches[3] );
		return self::decodeChar( $hexadecimal );
	}

	# Last case should be an ampersand by itself
	return $matches[0];
}
1680 
/**
 * Convert a numeric code point to its UTF-8 encoding.
 *
 * @param int|float $codepoint
 * @return string The UTF-8 string for the code point, or
 *   UtfNormal\Constants::UTF8_REPLACEMENT when validateCodepoint() rejects it
 */
static function decodeChar( $codepoint ) {
	if ( !self::validateCodepoint( $codepoint ) ) {
		return UtfNormal\Constants::UTF8_REPLACEMENT;
	}

	return UtfNormal\Utils::codepointToUtf8( $codepoint );
}
1695 
/**
 * Decode a named entity to UTF-8.
 *
 * The name is first mapped through HTML_ENTITY_ALIASES. If the resulting
 * name is not in HTML_ENTITIES, it is returned re-wrapped as "&name;".
 *
 * @param string $name Entity name without the surrounding '&' and ';'
 * @return string
 */
static function decodeEntity( $name ) {
	// Resolve an alias to its canonical entity name, if there is one
	$name = self::HTML_ENTITY_ALIASES[$name] ?? $name;

	if ( !isset( self::HTML_ENTITIES[$name] ) ) {
		return "&$name;";
	}

	return UtfNormal\Utils::codepointToUtf8( self::HTML_ENTITIES[$name] );
}
1714 
/**
 * Look up the attribute whitelist for one element.
 *
 * @param string $element Element name
 * @return array The whitelist entry for the element (attribute names as keys),
 *   or an empty array when the element has no entry
 */
private static function attributeWhitelistInternal( $element ) {
	$perElement = self::setupAttributeWhitelistInternal();

	if ( isset( $perElement[$element] ) ) {
		return $perElement[$element];
	}

	return [];
}
1726 
/**
 * Build the per-element attribute whitelist, caching it in a static
 * variable so that it is only constructed on the first call.
 *
 * @return array Element name => array whose keys are the allowed attribute
 *   names (values are the positions left over from array_flip())
 */
private static function setupAttributeWhitelistInternal() {
	static $cache = null;

	if ( $cache !== null ) {
		return $cache;
	}

	// Attribute lists are flipped so that attribute names become keys,
	// which makes lookups cheap. $extend() adds one or more flipped lists
	// on top of an already-flipped base set.
	$extend = function ( $base, ...$lists ) {
		foreach ( $lists as $list ) {
			$base = array_merge( $base, array_flip( $list ) );
		}
		return $base;
	};

	$htmlAttribs = [ 'id', 'class', 'style', 'lang', 'dir', 'title' ];

	$ariaAttribs = [
		'aria-describedby',
		'aria-flowto',
		'aria-label',
		'aria-labelledby',
		'aria-owns',
		'role',
	];

	# RDFa attributes, specified in section 9 of
	# https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
	$rdfaAttribs = [ 'about', 'property', 'resource', 'datatype', 'typeof' ];

	# Microdata attributes, specified by
	# https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
	$microdataAttribs = [ 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype' ];

	// Flipped as one combined list so each name keeps its overall position
	$common = array_flip(
		array_merge( $htmlAttribs, $ariaAttribs, $rdfaAttribs, $microdataAttribs )
	);

	$block = $extend( $common, [ 'align' ] );

	$tablealign = [ 'align', 'valign' ];
	$tablecell = [
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor', # deprecated
	];

	// Sets shared by more than one element
	$quoted = $extend( $common, [ 'cite' ] );
	$edited = $extend( $common, [ 'cite', 'datetime' ] );
	$columns = $extend( $common, [ 'span' ] );
	$cell = $extend( $common, $tablecell, $tablealign );

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: https://www.w3.org/TR/html4/
	$cache = [
		# 7.5.4
		'div' => $block,
		'center' => $common, # deprecated
		'span' => $common,

		# 7.5.5
		'h1' => $block,
		'h2' => $block,
		'h3' => $block,
		'h4' => $block,
		'h5' => $block,
		'h6' => $block,

		# 7.5.6
		# address

		# 8.2.4
		'bdo' => $common,

		# 9.2.1
		'em' => $common,
		'strong' => $common,
		'cite' => $common,
		'dfn' => $common,
		'code' => $common,
		'samp' => $common,
		'kbd' => $common,
		'var' => $common,
		'abbr' => $common,
		# acronym

		# 9.2.2
		'blockquote' => $quoted,
		'q' => $quoted,

		# 9.2.3
		'sub' => $common,
		'sup' => $common,

		# 9.3.1
		'p' => $block,

		# 9.3.2
		'br' => $extend( $common, [ 'clear' ] ),

		# https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
		'wbr' => $common,

		# 9.3.4
		'pre' => $extend( $common, [ 'width' ] ),

		# 9.4
		'ins' => $edited,
		'del' => $edited,

		# 10.2
		'ul' => $extend( $common, [ 'type' ] ),
		'ol' => $extend( $common, [ 'type', 'start', 'reversed' ] ),
		'li' => $extend( $common, [ 'type', 'value' ] ),

		# 10.3
		'dl' => $common,
		'dd' => $common,
		'dt' => $common,

		# 11.2.1
		'table' => $extend( $common, [
			'summary', 'width', 'border', 'frame',
			'rules', 'cellspacing', 'cellpadding',
			'align', 'bgcolor',
		] ),

		# 11.2.2
		'caption' => $block,

		# 11.2.3
		'thead' => $common,
		'tfoot' => $common,
		'tbody' => $common,

		# 11.2.4
		'colgroup' => $columns,
		'col' => $columns,

		# 11.2.5
		'tr' => $extend( $common, [ 'bgcolor' ], $tablealign ),

		# 11.2.6
		'td' => $cell,
		'th' => $cell,

		# 12.2
		# NOTE: <a> is not allowed directly, but the attrib
		# whitelist is used from the Parser object
		'a' => $extend( $common, [ 'href', 'rel', 'rev' ] ), # rel/rev esp. for RDFa

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized, or if $wgAllowImageTag is
		# true
		'img' => $extend( $common, [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
		# Attributes for A/V tags added in T163583 / T133673
		'audio' => $extend( $common, [ 'controls', 'preload', 'width', 'height' ] ),
		'video' => $extend( $common, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
		'source' => $extend( $common, [ 'type', 'src' ] ),
		'track' => $extend( $common, [ 'type', 'src', 'srclang', 'kind', 'label' ] ),

		# 15.2.1
		'tt' => $common,
		'b' => $common,
		'i' => $common,
		'big' => $common,
		'small' => $common,
		'strike' => $common,
		's' => $common,
		'u' => $common,

		# 15.2.2
		'font' => $extend( $common, [ 'size', 'color', 'face' ] ),
		# basefont

		# 15.3
		'hr' => $extend( $common, [ 'width' ] ),

		# HTML Ruby annotation text module, simple ruby only.
		# https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
		'ruby' => $common,
		# rbc
		'rb' => $common,
		'rp' => $common,
		'rt' => $common, # $extend( $common, [ 'rbspan' ] ),
		'rtc' => $common,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# https://www.w3.org/TR/REC-MathML/
		'math' => array_flip( [ 'class', 'style', 'id', 'title' ] ),

		// HTML 5 section 4.5
		'figure' => $common,
		'figure-inline' => $common, # T118520
		'figcaption' => $common,

		# HTML 5 section 4.6
		'bdi' => $common,

		# HTML5 elements, defined by:
		# https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
		'data' => $extend( $common, [ 'value' ] ),
		'time' => $extend( $common, [ 'datetime' ] ),
		'mark' => $common,

		// meta and link are only permitted by removeHTMLtags when Microdata
		// is enabled so we don't bother adding a conditional to hide these
		// Also meta and link are only valid in WikiText as Microdata elements
		// (ie: validateTag rejects tags missing the attributes needed for Microdata)
		// So we don't bother including $common attributes that have no purpose.
		'meta' => array_flip( [ 'itemprop', 'content' ] ),
		'link' => array_flip( [ 'itemprop', 'href', 'title' ] ),
	];

	return $cache;
}
1963 
/**
 * Extract the text of an HTML string by tokenizing it with RemexHtml and
 * collecting the result from a RemexStripTagHandler, then normalizing
 * whitespace.
 *
 * @param string $html
 * @return string
 */
static function stripAllTags( $html ) {
	$tokenizerOptions = [
		'ignoreErrors' => true,
		// don't ignore char refs, we want them to be decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	];

	// Use RemexHtml to tokenize $html and extract the text
	$textCollector = new RemexStripTagHandler;
	$remex = new RemexHtml\Tokenizer\Tokenizer( $textCollector, $html, $tokenizerOptions );
	$remex->execute();

	return self::normalizeWhitespace( $textCollector->getResult() );
}
1990 
/**
 * Build a DOCTYPE declaration whose internal subset declares every entry
 * of HTML_ENTITIES as an entity expanding to its numeric reference.
 *
 * @return string
 */
static function hackDocType() {
	$declarations = [];
	foreach ( self::HTML_ENTITIES as $entity => $codepoint ) {
		$declarations[] = "<!ENTITY $entity \"&#$codepoint;\">";
	}

	return "<!DOCTYPE html [\n" . implode( '', $declarations ) . "]>\n";
}
2008 
/**
 * Clean up a URL: decode character references, percent-encode a fixed set
 * of unsafe characters, strip characters ignored in IDNs from the host
 * part, and restore the brackets of a percent-encoded IPv6 host.
 *
 * @param string $url
 * @return string The cleaned URL. When the URL does not have the
 *   "protocol:[//host]rest" shape, it is returned after the first two steps only.
 */
static function cleanUrl( $url ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = self::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step
	$url = preg_replace_callback(
		'/[\][<>"\\x00-\\x20\\x7F\|]/',
		[ __CLASS__, 'cleanUrlCallback' ],
		$url
	);

	# Validate hostname portion
	$parts = [];
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $parts ) ) {
		return $url;
	}
	$protocol = $parts[1];
	$host = $parts[2];
	$rest = $parts[3];

	// Characters that will be ignored in IDNs.
	// https://tools.ietf.org/html/rfc3454#section-3.1
	// Strip them before further processing so blacklists and such work.
	$strip = "/
		\\s|          # general whitespace
		\xc2\xad|     # 00ad SOFT HYPHEN
		\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
		\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
		\xe2\x81\xa0| # 2060 WORD JOINER
		\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
		\xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
		\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
		\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
		\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
		\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
		\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
		[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
		/xuD";

	$host = preg_replace( $strip, '', $host );

	// IPv6 host names are bracketed with []. Url-decode these.
	$ipv6 = [];
	if ( substr_compare( "//%5B", $host, 0, 5 ) === 0 &&
		preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $ipv6 )
	) {
		$host = '//[' . $ipv6[1] . ']' . $ipv6[2];
	}

	// @todo FIXME: Validate hostnames here

	return $protocol . $host . $rest;
}
2062 
/**
 * Callback used by cleanUrl(): URL-encode the matched text.
 *
 * @param array $matches Match array; only the full match at index 0 is used
 * @return string
 */
static function cleanUrlCallback( $matches ) {
	$matched = $matches[0];

	return urlencode( $matched );
}
2070 
/**
 * Check whether a string looks like a valid e-mail address.
 *
 * The 'isValidEmailAddr' hook runs first; if it returns false, whatever the
 * hook left in $result is returned as-is. Otherwise the address is matched
 * against a pattern of the form user@domain, where the user part is one or
 * more atext characters or dots and the domain is one or more dot-separated
 * labels of letters, digits and hyphens.
 *
 * @param string $addr E-mail address to check
 * @return bool|null Result of the pattern match, or the hook-provided
 *   $result (null unless the hook set it) when the hook aborts
 */
public static function validateEmail( $addr ) {
	$result = null;
	if ( !Hooks::run( 'isValidEmailAddr', [ $addr, &$result ] ) ) {
		return $result;
	}

	// Please note strings below are enclosed in brackets [], this make the
	// hyphen "-" a range indicator. Hence it is double backslashed below.
	// See T28948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	// The D modifier (PCRE_DOLLAR_ENDONLY) makes "$" match only at the very
	// end of the subject. Without it "$" also matches just before a trailing
	// newline, so "user@example.org\n" would be accepted.
	$html5_email_regexp = "/
		^                         # start of string
		[$rfc5322_atext\\.]+      # user part which is liberal :p
		@                         # 'at' sign
		[$rfc1034_ldh_str]+       # First domain part
		(\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
		$                         # End of string
		/ixD"; // case Insensitive, eXtended, Dollar matches at end only

	return (bool)preg_match( $html5_email_regexp, $addr );
}
2122 }
MediaWiki\MediaWikiServices
MediaWikiServices is the service locator for the application scope of MediaWiki.
Definition: MediaWikiServices.php:130
$wgExternalInterwikiFragmentMode
$wgExternalInterwikiFragmentMode
Which ID escaping mode should be used for external interwiki links? See documentation for $wgFragment...
Definition: DefaultSettings.php:3458
$wgFragmentMode
$wgFragmentMode
How should section IDs be encoded? This array can contain 1 or 2 elements, each of them can be one of...
Definition: DefaultSettings.php:3448
MWTidy\isEnabled
static isEnabled()
Definition: MWTidy.php:54
RemexStripTagHandler
Definition: RemexStripTagHandler.php:9
$wgAllowImageTag
$wgAllowImageTag
A different approach to the above: simply allow the "<img>" tag to be used.
Definition: DefaultSettings.php:4227
MWException
MediaWiki exception.
Definition: MWException.php:26
wfDeprecated
wfDeprecated( $function, $version=false, $component=false, $callerOffset=2)
Throws a warning that $function is deprecated.
Definition: GlobalFunctions.php:1044
$matches
$matches
Definition: NoLocalSettings.php:24
$args
if( $line===false) $args
Definition: mcc.php:124
wfUrlProtocols
wfUrlProtocols( $includeProtocolRelative=true)
Returns a regular expression of url protocols.
Definition: GlobalFunctions.php:719
StringUtils\delimiterReplace
static delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags='')
Perform an operation equivalent to preg_replace() with flags.
Definition: StringUtils.php:248
$t
$t
Definition: testCompression.php:71
Hooks\run
static run( $event, array $args=[], $deprecatedVersion=null)
Call hook functions defined in Hooks::register and $wgHooks.
Definition: Hooks.php:200