MediaWiki  master
Sanitizer.php
Go to the documentation of this file.
1 <?php
28 use RemexHtml\HTMLData;
29 
34 class Sanitizer {
41  private const CHAR_REFS_REGEX =
42  '/&([A-Za-z0-9\x80-\xff]+;)
43  |&\#([0-9]+);
44  |&\#[xX]([0-9A-Fa-f]+);
45  |(&)/x';
46 
51  private const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
52 
62  private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
63  private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
64 
70  public const ID_PRIMARY = 0;
71 
78  public const ID_FALLBACK = 1;
79 
84  private const MW_ENTITY_ALIASES = [
85  'רלמ;' => 'rlm;',
86  'رلم;' => 'rlm;',
87  ];
88 
92  private static $attribsRegex;
93 
/**
 * Lazily build and cache the regular expression that splits an attribute
 * string into name/value pairs.
 *
 * Capture groups: 1 = attribute name, 2 = the whole "= value" part,
 * 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value.
 *
 * @return string PCRE pattern (with the s, x and u modifiers)
 */
private static function getAttribsRegex() {
	if ( self::$attribsRegex !== null ) {
		return self::$attribsRegex;
	}

	// Whitespace characters: tab, LF, FF, CR and space
	$wsChars = '\x09\x0a\x0c\x0d\x20';
	$ws = "[{$wsChars}]";
	// Any character that may continue an attribute name
	$nameChar = "[^{$wsChars}\/>=]";
	// The first character of a name may additionally be "="
	$nameStart = "(?:{$nameChar}|=)";

	self::$attribsRegex =
		"/({$nameStart}{$nameChar}*)
		($ws*=$ws*
		(?:
		# The attribute value: quoted or alone
		\"([^\"]*)(?:\"|\$)
		| '([^']*)(?:'|\$)
		| (((?!$ws|>).)*)
		)
		)?/sxu";

	return self::$attribsRegex;
}
119 
123  private static $attribNameRegex;
124 
/**
 * Lazily build and cache the regular expression used to accept or
 * reject a complete attribute name.
 *
 * @return string PCRE pattern (with the s, x and u modifiers)
 */
private static function getAttribNameRegex() {
	if ( self::$attribNameRegex !== null ) {
		return self::$attribNameRegex;
	}

	// First character: colon, underscore, or any Unicode letter/number
	$nameStart = "[:_\p{L}\p{N}]";
	// Later characters may also be a dot or a hyphen
	$nameChar = "[:_\.\-\p{L}\p{N}]";
	self::$attribNameRegex = "/^({$nameStart}{$nameChar}*)$/sxu";

	return self::$attribNameRegex;
}
137 
/**
 * Return the lists of recognized HTML tags as lookup tables.
 *
 * The static lists are built once and cached for the current value of
 * $wgAllowImageTag; they are rebuilt only when that setting changes.
 *
 * @param string[] $extratags Additional tag names to treat as allowed paired tags
 * @param string[] $removetags Tag names to remove from the allowed elements
 * @return array[] Map of list name => ( tag name => int ) lookup table, with the
 *  keys htmlpairs, htmlsingle, htmlsingleonly, htmlnest, tabletags, htmllist,
 *  listtags, htmlsingleallowed and htmlelements
 */
public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
	global $wgAllowImageTag;

	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

	// Key the cache on the (normalised) global setting so that a changed global, as
	// happens in tests, triggers a rebuild. The comparison is strict: the static starts
	// out as null, so the first call always initialises, while a cached "false" is
	// no longer mistaken for "not initialised yet" on every subsequent call.
	$globalContext = (bool)$wgAllowImageTag;
	if ( $staticInitialised !== $globalContext ) {
		$htmlpairsStatic = [ # Tags that must be closed
			'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
			'kbd', 'samp', 'data', 'time', 'mark'
		];
		$htmlsingle = [
			'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
		];

		# Elements that cannot have close tags. This is (not coincidentally)
		# also the list of tags for which the HTML 5 parsing algorithm
		# requires you to "acknowledge the token's self-closing flag", i.e.
		# a self-closing tag like <br/> is not an HTML 5 parse error only
		# for this list.
		$htmlsingleonly = [
			'br', 'wbr', 'hr', 'meta', 'link'
		];

		$htmlnest = [ # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
			'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
		];
		$tabletags = [ # Can only appear inside table, we will close them
			'td', 'th', 'tr',
		];
		$htmllist = [ # Tags used by list
			'ul', 'ol',
		];
		$listtags = [ # Tags that can appear in a list
			'li',
		];

		if ( $globalContext ) {
			wfDeprecatedMsg( 'Setting $wgAllowImageTag to true ' .
				'is deprecated since MediaWiki 1.35', '1.35', false, false );
			$htmlsingle[] = 'img';
			$htmlsingleonly[] = 'img';
		}

		$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
		$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

		# Convert them all to hashtables for faster lookup
		$htmlpairsStatic = array_flip( $htmlpairsStatic );
		$htmlsingle = array_flip( $htmlsingle );
		$htmlsingleonly = array_flip( $htmlsingleonly );
		$htmlnest = array_flip( $htmlnest );
		$tabletags = array_flip( $tabletags );
		$htmllist = array_flip( $htmllist );
		$listtags = array_flip( $listtags );
		$htmlsingleallowed = array_flip( $htmlsingleallowed );
		$htmlelementsStatic = array_flip( $htmlelementsStatic );

		$staticInitialised = $globalContext;
	}

	# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
	$extratags = array_flip( $extratags );
	$removetags = array_flip( $removetags );
	$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
	$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );

	return [
		'htmlpairs' => $htmlpairs,
		'htmlsingle' => $htmlsingle,
		'htmlsingleonly' => $htmlsingleonly,
		'htmlnest' => $htmlnest,
		'tabletags' => $tabletags,
		'htmllist' => $htmllist,
		'listtags' => $listtags,
		'htmlsingleallowed' => $htmlsingleallowed,
		'htmlelements' => $htmlelements,
	];
}
227 
/**
 * Clean up HTML in wikitext: strip comments, keep recognized tags (with
 * their attributes sanitized) and escape everything else.
 *
 * @param string $text Input text
 * @param callable|null $processCallback Optional callback invoked with the raw
 *  attribute string (by reference) and $args for every recognized tag
 * @param mixed $args Second argument handed to $processCallback
 * @param string[] $extratags Additional tag names to allow
 * @param string[] $removetags Tag names to disallow
 * @return string
 */
public static function removeHTMLtags( $text, $processCallback = null,
	$args = [], $extratags = [], $removetags = []
) {
	$tagData = self::getRecognizedTagData( $extratags, $removetags );
	$singleTags = $tagData['htmlsingle'];
	$voidTags = $tagData['htmlsingleonly'];
	$allowedTags = $tagData['htmlelements'];

	# Remove HTML comments, then split on every "<"
	$segments = explode( '<', self::removeHTMLcomments( $text ) );
	$out = str_replace( '>', '&gt;', array_shift( $segments ) );

	# this might be possible using remex tidy itself
	foreach ( $segments as $segment ) {
		// What gets emitted whenever the segment is not an acceptable tag
		$escapedSegment = '&lt;' . str_replace( '>', '&gt;', $segment );

		if ( !preg_match( self::ELEMENT_BITS_REGEX, $segment, $parts ) ) {
			$out .= $escapedSegment;
			continue;
		}
		[ , $slash, $tag, $params, $brace, $rest ] = $parts;
		$tag = strtolower( $tag );

		if ( !isset( $allowedTags[$tag] ) ) {
			$out .= $escapedSegment;
			continue;
		}

		if ( is_callable( $processCallback ) ) {
			// $params is passed by reference so the callback may rewrite it
			call_user_func_array( $processCallback, [ &$params, $args ] );
		}

		if ( $brace == '/>' && !isset( $singleTags[$tag] ) && !isset( $voidTags[$tag] ) ) {
			// Remove the self-closing slash, to be consistent
			// with HTML5 semantics. T134423
			$brace = '>';
		}

		$tagIsValid = self::validateTag( $params, $tag );
		$newparams = self::fixTagAttributes( $params, $tag );
		if ( !$tagIsValid ) {
			$out .= $escapedSegment;
			continue;
		}

		if ( $brace === '/>' && !isset( $voidTags[$tag] ) ) {
			# Interpret self-closing tags as empty tags even when
			# HTML 5 would interpret them as start tags. Such input
			# is commonly seen on Wikimedia wikis with this intention.
			$brace = "></$tag>";
		}

		$rest = str_replace( '>', '&gt;', $rest );
		$out .= "<$slash$tag$newparams$brace$rest";
	}

	return $out;
}
298 
/**
 * Remove "<!-- ... -->" comments from the text.
 *
 * When a comment sits on a line of its own (only spaces around it, with a
 * newline before and after), the whole line is collapsed to one newline.
 * An unterminated comment stops processing and is left in place.
 *
 * @param string $text
 * @return string
 */
public static function removeHTMLcomments( $text ) {
	for ( ; ; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		$afterComment = $close + 3;

		# Grow a window around the comment that also covers the
		# spaces directly before and after it
		$windowStart = max( $open - 1, 0 );
		$windowLen = $afterComment - $windowStart;
		while ( $windowStart > 0 && substr( $text, $windowStart, 1 ) === ' ' ) {
			$windowStart--;
			$windowLen++;
		}
		while ( substr( $text, $windowStart + $windowLen, 1 ) === ' ' ) {
			$windowLen++;
		}

		$newlineBefore = substr( $text, $windowStart, 1 ) === "\n";
		$newlineAfter = substr( $text, $windowStart + $windowLen, 1 ) === "\n";

		if ( $newlineBefore && $newlineAfter ) {
			# Remove the comment, leading and trailing
			# spaces, and leave only one newline.
			$text = substr_replace( $text, "\n", $windowStart, $windowLen + 1 );
		} else {
			# Remove just the comment.
			$text = substr_replace( $text, '', $open, $afterComment - $open );
		}
	}

	return $text;
}
341 
/**
 * Check tag-specific requirements on the attributes of an element.
 *
 * Only <meta> and <link> have requirements: both need an itemprop
 * attribute; <meta> additionally needs content and <link> needs href.
 *
 * @param string $params Raw attribute string of the tag
 * @param string $element Lower-case element name
 * @return bool False when a required attribute is missing
 */
private static function validateTag( $params, $element ) {
	$attribs = self::decodeTagAttributes( $params );

	$isMeta = $element == 'meta';
	$isLink = $element == 'link';
	if ( !$isMeta && !$isLink ) {
		return true;
	}

	// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
	if ( !isset( $attribs['itemprop'] ) ) {
		return false;
	}
	// <meta> must have a content="" for the itemprop
	if ( $isMeta && !isset( $attribs['content'] ) ) {
		return false;
	}
	// <link> must have an associated href=""
	if ( $isLink && !isset( $attribs['href'] ) ) {
		return false;
	}

	return true;
}
374 
/**
 * Filter an attribute map down to what is allowed on the given element.
 *
 * @param array $attribs Map of attribute name => value
 * @param string $element Element name
 * @return array Filtered attribute map
 */
public static function validateTagAttributes( $attribs, $element ) {
	$allowedForElement = self::attributesAllowedInternal( $element );

	return self::validateAttributes( $attribs, $allowedForElement );
}
394 
/**
 * Filter an attribute map against a list of allowed attribute names and
 * sanitize the values that are kept.
 *
 * @param array $attribs Map of attribute name => value
 * @param array $allowed Allowed attribute names as array keys; a sequential
 *  list of names is still accepted but deprecated
 * @return array Filtered attribute map
 */
public static function validateAttributes( $attribs, $allowed ) {
	if ( isset( $allowed[0] ) ) {
		// Calling this function with a sequential array is
		// deprecated. For now just convert it.
		wfDeprecated( __METHOD__ . ' with sequential array', '1.35' );
		$allowed = array_flip( $allowed );
	}
	$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	// Attributes holding a space-separated list of element ids
	$idListAttribs = [ 'aria-describedby', 'aria-flowto', 'aria-labelledby', 'aria-owns' ];
	// RDFa and microdata properties allow URLs, URIs and/or CURIs.
	$uriLikeAttribs = [
		'rel', 'rev',
		# RDFa
		'about', 'property', 'resource', 'datatype', 'typeof',
		# HTML5 microdata
		'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',
	];
	// Attributes that must hold a URL with an allowed protocol
	$urlAttribs = [ 'href', 'src', 'poster' ];

	$out = [];
	foreach ( $attribs as $attribute => $value ) {
		# Allow XML namespace declaration to allow RDFa
		if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				$out[$attribute] = $value;
			}
			continue;
		}

		# Allow any attribute beginning with "data-"
		# However:
		# * Disallow data attributes used by MediaWiki code
		# * Ensure that the attribute is not namespaced by banning
		#   colons.
		$isDataAttrib = preg_match( '/^data-[^:]*$/i', $attribute );
		if ( !$isDataAttrib && !array_key_exists( $attribute, $allowed ) ) {
			continue;
		}
		if ( self::isReservedDataAttribute( $attribute ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# https://msdn.microsoft.com/en-us/library/ms537634.aspx
		if ( $attribute == 'style' ) {
			$value = self::checkCss( $value );
		}

		# Escape HTML id attributes
		if ( $attribute === 'id' ) {
			$value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
		}

		# Escape HTML id reference lists
		if ( in_array( $attribute, $idListAttribs, true ) ) {
			$value = self::escapeIdReferenceListInternal( $value );
		}

		// Paranoia. Allow "simple" values but suppress javascript
		if ( in_array( $attribute, $uriLikeAttribs, true )
			&& preg_match( self::EVIL_URI_PATTERN, $value )
		) {
			continue;
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		#       validation code that can be used by tag hook handlers, etc
		if ( in_array( $attribute, $urlAttribs, true ) && !preg_match( $hrefExp, $value ) ) {
			// Drop any href or src attributes not using an allowed protocol.
			// NOTE: this also drops all relative URLs
			continue;
		}

		// Only allow tabindex of 0, which is useful for accessibility.
		if ( $attribute === 'tabindex' && $value !== '0' ) {
			continue;
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$out[$attribute] = $value;
	}

	# itemtype, itemid, itemref don't make sense without itemscope
	if ( !array_key_exists( 'itemscope', $out ) ) {
		unset( $out['itemtype'], $out['itemid'], $out['itemref'] );
	}
	# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

	return $out;
}
512 
/**
 * Tell whether an attribute name lies in one of the reserved data-
 * prefixes (data-ooui, data-mw, data-parsoid), matched case-insensitively.
 *
 * @param string $attr Attribute name
 * @return bool
 */
public static function isReservedDataAttribute( $attr ) {
	// data-ooui is reserved for ooui.
	// data-mw and data-parsoid are reserved for parsoid.
	// data-mw-<name here> is reserved for extensions (or core) if
	// they need to communicate some data to the client and want to be
	// sure that it isn't coming from an untrusted user.
	// We ignore the possibility of namespaces since user-generated HTML
	// can't use them anymore.
	$reservedPrefix = '/^data-(ooui|mw|parsoid)/i';

	return preg_match( $reservedPrefix, $attr ) === 1;
}
530 
/**
 * Merge two attribute maps; values from $b win, except that two differing
 * string "class" values are combined into one de-duplicated class list.
 *
 * @param array $a
 * @param array $b
 * @return array
 */
public static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	$classA = $a['class'] ?? null;
	$classB = $b['class'] ?? null;
	if ( !is_string( $classA ) || !is_string( $classB ) || $classA === $classB ) {
		return $merged;
	}

	$tokens = preg_split( '/\s+/', "{$classA} {$classB}", -1, PREG_SPLIT_NO_EMPTY );
	$merged['class'] = implode( ' ', array_unique( $tokens ) );

	return $merged;
}
553 
/**
 * Normalize CSS into a form where keywords can be matched reliably:
 * decode character references and CSS escapes, then neutralise comments.
 *
 * @param string $value Raw CSS text
 * @return string Normalized CSS text
 */
public static function normalizeCss( $value ) {
	// Decode character references like &#123;
	$value = self::decodeCharReferences( $value );

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references.
	// This means it isn't possible for this function to return
	// unsanitized escape sequences. It is possible to manufacture
	// input that contains character references that decode to
	// escape sequences that decode to character references, but
	// it's OK for the return value to contain character references
	// because the caller is supposed to escape those anyway.
	static $decodeRegex;
	if ( $decodeRegex === null ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$decodeRegex = "/ $backslash
			(?:
			($nl) | # 1. Line continuation
			([0-9A-Fa-f]{1,6})$space? | # 2. character number
			(.) | # 3. backslash cancelling special meaning
			() | # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback( $decodeRegex,
		[ __CLASS__, 'cssDecodeCallback' ], $value );

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if ( preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
		return $value;
	}

	// Remove any comments; IE gets token splitting wrong
	// This must be done AFTER decoding character references and
	// escape sequences, because those steps can introduce comments
	// This step cannot introduce character references or escape
	// sequences, because it replaces comments with spaces rather
	// than removing them completely.
	$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

	// Remove anything after a comment-start token, to guard against
	// incorrect client implementations.
	$danglingComment = strpos( $value, '/*' );

	return $danglingComment === false
		? $value
		: substr( $value, 0, $danglingComment );
}
613 
/**
 * Pick apart some CSS and check it for forbidden or unsafe structures.
 *
 * The input is normalized first. If it then contains control characters
 * or the UTF-8 replacement character, or any of the blocked keywords
 * listed in the pattern below, a harmless CSS comment describing the
 * problem is returned in its place.
 *
 * @param string $value Raw CSS text, e.g. the value of a style attribute
 * @return string Normalized CSS, or a CSS comment if the input was rejected
 */
public static function checkCss( $value ) {
	$value = self::normalizeCss( $value );

	// Reject problematic keywords and control characters
	if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ||
		strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false ) {
		return '/* invalid control char */';
	} elseif ( preg_match(
		// Each "\(" below must be a plain backslash followed directly by the
		// parenthesis. Any stray (even invisible) character in between leaves
		// the group unbalanced; the pattern then fails to compile,
		// preg_match() returns false and the whole check is bypassed.
		'! expression
			| filter\s*:
			| accelerator\s*:
			| -o-link\s*:
			| -o-link-source\s*:
			| -o-replace\s*:
			| url\s*\(
			| image\s*\(
			| image-set\s*\(
			| attr\s*\([^)]+[\s,]+url
			| var\s*\(
		!ix', $value ) ) {
		return '/* insecure input */';
	}
	return $value;
}
656 
/**
 * preg_replace_callback() handler that decodes one CSS backslash escape.
 *
 * @param array $matches Groups: 1 = line continuation, 2 = hex code point,
 *  3 = single escaped character; none of them set to a non-empty string
 *  means a backslash at the end of the string
 * @return string Replacement text
 */
private static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		// Line continuation
		return '';
	}

	if ( $matches[2] !== '' ) {
		$decoded = UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$decoded = $matches[3];
	} else {
		$decoded = '\\';
	}

	// These characters need to be escaped in strings
	$mustStayEscaped = $decoded == "\n" || $decoded == '"' || $decoded == "'" || $decoded == '\\';
	if ( !$mustStayEscaped ) {
		// Decode unnecessary escape
		return $decoded;
	}

	// Clean up the escape sequence to avoid parsing errors by clients
	return '\\' . dechex( ord( $decoded ) ) . ' ';
}
681 
/**
 * Take the raw attribute text of a tag, keep only the attributes allowed
 * on the element, and return them re-encoded as a normalized string.
 *
 * @param string $text Raw attribute string
 * @param string $element Element name
 * @param bool $sorted Whether to sort the attributes by name
 * @return string Encoded attribute string, empty if $text is blank
 */
public static function fixTagAttributes( $text, $element, $sorted = false ) {
	if ( trim( $text ) === '' ) {
		return '';
	}

	$attribs = self::validateTagAttributes(
		self::decodeTagAttributes( $text ),
		$element
	);
	if ( $sorted ) {
		ksort( $attribs );
	}

	return self::safeEncodeTagAttributes( $attribs );
}
717 
/**
 * Encode a value so it can be placed inside a quoted HTML attribute.
 *
 * @param string $text
 * @return string HTML-encoded text; newline, carriage return and tab are
 *  turned into numeric character references
 */
public static function encodeAttribute( $text ) {
	// Whitespace is normalized during attribute decoding,
	// so if we've been passed non-spaces we must encode them
	// ahead of time or they won't be preserved.
	$whitespaceRefs = [
		"\n" => '&#10;',
		"\r" => '&#13;',
		"\t" => '&#9;',
	];

	return strtr( htmlspecialchars( $text, ENT_QUOTES ), $whitespaceRefs );
}
737 
/**
 * Replace the ordinary space that French typography puts next to certain
 * punctuation marks with a non-breaking space.
 *
 * @param string $text Text to process
 * @param string $space Literal text to insert in place of the space;
 *  defaults to the &#160; character reference
 * @return string
 */
public static function armorFrenchSpaces( $text, $space = '&#160;' ) {
	// Replace $ with \$ and \ with \\ so that $space is inserted literally
	// instead of being read as a backreference by preg_replace() below.
	// The pattern must contain nothing but the escaped backslashes and the
	// dollar sign; any stray (even invisible) character after a backslash
	// stops backslashes from being escaped.
	$space = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );
	$fixtags = [
		# French spaces, last one Guillemet-left
		# only if it isn't followed by a word character.
		'/ (?=[?:;!%»›](?!\w))/u' => "$space",
		# French spaces, Guillemet-right
		'/([«‹]) /u' => "\\1$space",
	];
	return preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
}
758 
/**
 * Encode an attribute value for HTML and additionally armor characters
 * and keywords that later wikitext parsing passes could act upon.
 *
 * @param string $text
 * @return string
 */
public static function safeEncodeAttribute( $text ) {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$armored = strtr( self::encodeAttribute( $text ), [
		'<' => '&lt;', // This should never happen,
		'>' => '&gt;', // we've received invalid input
		'"' => '&quot;', // which should have been escaped.
		'{' => '&#123;',
		'}' => '&#125;', // prevent unpaired language conversion syntax
		'[' => '&#91;',
		']' => '&#93;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	] );

	# Stupid hack: encode the colon in anything that looks like a URL protocol
	$protocolRegex = '/((?i)' . wfUrlProtocols() . ')/';
	$hideColon = static function ( $hit ) {
		return str_replace( ':', '&#58;', $hit[1] );
	};

	return preg_replace_callback( $protocolRegex, $hideColon, $armored );
}
795 
/**
 * Escape a string so it can be used as the value of an id attribute,
 * using the fragment mode configured in $wgFragmentMode for $mode.
 *
 * @param string $id
 * @param int $mode Index into $wgFragmentMode: self::ID_PRIMARY or self::ID_FALLBACK
 * @return string|bool Escaped id, or false if no mode is configured for a
 *  non-primary $mode
 * @throws UnexpectedValueException If no primary mode is configured
 */
public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
	global $wgFragmentMode;

	if ( isset( $wgFragmentMode[$mode] ) ) {
		return self::escapeIdInternal( $id, $wgFragmentMode[$mode] );
	}

	if ( $mode === self::ID_PRIMARY ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return false;
}
825 
/**
 * Escape a string so it can be used as the fragment part of a link,
 * using the primary fragment mode from $wgFragmentMode.
 *
 * @param string $id
 * @return string Escaped fragment
 * @throws UnexpectedValueException If no primary mode is configured
 */
public static function escapeIdForLink( $id ) {
	global $wgFragmentMode;

	if ( !isset( $wgFragmentMode[self::ID_PRIMARY] ) ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	// The mode has to be read from the configuration before it is used;
	// without this assignment $mode would be undefined below.
	$mode = $wgFragmentMode[self::ID_PRIMARY];

	$id = self::escapeIdInternalUrl( $id, $mode );

	return $id;
}
851 
/**
 * Escape a string so it can be used as the fragment part of a link that
 * points to an external (interwiki) site.
 *
 * NOTE(review): the global declaration and the escaping call were missing
 * from this copy of the function, which made it return $id unchanged. They
 * are restored here on the assumption that the mode comes from the
 * $wgExternalInterwikiFragmentMode setting - confirm against configuration.
 *
 * @param string $id
 * @return string Escaped fragment
 */
public static function escapeIdForExternalInterwiki( $id ) {
	global $wgExternalInterwikiFragmentMode;

	$id = self::escapeIdInternalUrl( $id, $wgExternalInterwikiFragmentMode );

	return $id;
}
868 
/**
 * Escape an id for use in a URL fragment.
 *
 * Works like escapeIdInternal(); in 'html5' mode every "%" that starts a
 * two-digit hex sequence is additionally encoded as "%25".
 *
 * @param string $id
 * @param string $mode Fragment mode, 'html5' or 'legacy'
 * @return string
 */
private static function escapeIdInternalUrl( $id, $mode ) {
	$escaped = self::escapeIdInternal( $id, $mode );

	return $mode === 'html5'
		? preg_replace( '/%([a-fA-F0-9]{2})/', '%25$1', $escaped )
		: $escaped;
}
885 
/**
 * Escape an id according to the given fragment mode.
 *
 * @param string $id
 * @param string $mode Fragment mode, 'html5' or 'legacy'
 * @return string
 * @throws InvalidArgumentException For any other mode
 */
private static function escapeIdInternal( $id, $mode ) {
	// Truncate overly-long IDs. This isn't an HTML limit, it's just
	// griefer protection. [T251506]
	$id = mb_substr( $id, 0, 1024 );

	// The mode is compared loosely, exactly as a switch statement would do.
	if ( $mode == 'html5' ) {
		// html5 spec says ids must not have any of the following:
		// U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE
		// In practice, in wikitext, only tab, LF, CR (and SPACE) are
		// possible using either Lua or html entities.
		return str_replace( [ "\t", "\n", "\f", "\r", " " ], '_', $id );
	}

	if ( $mode == 'legacy' ) {
		// This corresponds to 'noninitial' mode of the former escapeId()
		$urlEncoded = urlencode( str_replace( ' ', '_', $id ) );

		return strtr( $urlEncoded, [
			'%3A' => ':',
			'%' => '.'
		] );
	}

	throw new InvalidArgumentException( "Invalid mode '$mode' passed to '" . __METHOD__ );
}
922 
/**
 * Escape every id in a space-separated list of ids.
 *
 * @deprecated since 1.36; each call emits a deprecation notice
 * @param string $referenceString Space-separated list of ids
 * @return string
 */
public static function escapeIdReferenceList( $referenceString ) {
	wfDeprecated( __METHOD__, '1.36' );

	$escapedList = self::escapeIdReferenceListInternal( $referenceString );

	return $escapedList;
}
937 
/**
 * Escape every id in a whitespace-separated list of ids, as used by
 * attributes such as aria-labelledby.
 *
 * @param string $referenceString Whitespace-separated list of ids
 * @return string Escaped ids joined by single spaces; '' for an empty list
 */
private static function escapeIdReferenceListInternal( $referenceString ) {
	# Explode the space delimited list string into an array of tokens
	$tokens = preg_split( '/\s+/', (string)$referenceString, -1, PREG_SPLIT_NO_EMPTY );

	# Escape each token as an id
	$escapedTokens = array_map(
		static function ( $token ) {
			return self::escapeIdForAttribute( $token );
		},
		$tokens
	);

	# Merge the array back to a space delimited list string
	# If the array is empty, the result will be an empty string ('')
	return implode( ' ', $escapedTokens );
}
960 
/**
 * Turn an arbitrary string into something usable as a CSS class name.
 *
 * @param string $class
 * @return string
 */
public static function escapeClass( $class ) {
	$patterns = [
		// Convert ugly stuff to underscores...
		'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
		// ...and collapse runs of underscores into one
		'/_+/',
	];
	$underscored = preg_replace( $patterns, '_', $class );

	// Kill underscores in ugly places (the end of the name)
	return rtrim( $underscored, '_' );
}
979 
/**
 * Escape text for HTML output while letting character references that
 * are already present in the input pass through without double-escaping.
 *
 * @param string $html
 * @return string
 */
public static function escapeHtmlAllowEntities( $html ) {
	$decoded = self::decodeCharReferences( $html );

	# It seems wise to escape ' as well as ", as a matter of course. Can't
	# hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
	# don't cause the entire string to disappear.
	return htmlspecialchars( $decoded, ENT_QUOTES | ENT_SUBSTITUTE );
}
995 
/**
 * Parse the attribute text of a tag into a map of lower-case attribute
 * name => decoded value. When a name occurs more than once the last
 * occurrence wins; names with unacceptable characters are skipped.
 *
 * @param string $text Raw attribute string
 * @return array
 */
public static function decodeTagAttributes( $text ) {
	if ( trim( $text ) === '' ) {
		return [];
	}

	$matches = [];
	$found = preg_match_all( self::getAttribsRegex(), $text, $matches, PREG_SET_ORDER );
	if ( !$found ) {
		return [];
	}

	$attribs = [];
	foreach ( $matches as $match ) {
		$name = strtolower( $match[1] );

		// Only keep attribute names made of acceptable characters
		if ( preg_match( self::getAttribNameRegex(), $name ) ) {
			$rawValue = self::getTagAttributeCallback( $match );

			// Normalize whitespace
			$collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $rawValue ) );

			// Decode character references
			$attribs[$name] = self::decodeCharReferences( $collapsed );
		}
	}

	return $attribs;
}
1038 
/**
 * Build an HTML attribute string from a map of attribute name => value.
 *
 * @param array $assoc_array Map of attribute name => value
 * @return string Each attribute as name="value", every one preceded by a
 *  space; '' when the map is empty
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
	$html = '';
	foreach ( $assoc_array as $name => $rawValue ) {
		$encName = htmlspecialchars( $name );
		$encValue = self::safeEncodeAttribute( $rawValue );

		$html .= " $encName=\"$encValue\"";
	}

	return $html;
}
1056 
/**
 * Pick the attribute value out of one match of the attribute regex.
 *
 * @param array $set Match set: 1 = name, 2 = "= value" part,
 *  3 = double-quoted, 4 = single-quoted, 5 = unquoted value
 * @return string
 * @throws MWException When a value part matched but no value group is set
 */
private static function getTagAttributeCallback( $set ) {
	// Check the unquoted (5), single-quoted (4) and double-quoted (3)
	// groups, in that order; the first one that is set holds the value.
	foreach ( [ 5, 4, 3 ] as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( !isset( $set[2] ) ) {
		# In XHTML, attributes must have a value so return an empty string.
		# See "Empty attribute syntax",
		# https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
		return "";
	}

	throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
}
1084 
/**
 * Collapse every run of spaces, tabs, carriage returns and line feeds
 * into a single space and trim the result.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
	$collapsed = preg_replace( '/(?:\r\n|[\x20\x0d\x0a\x09])+/', ' ', $text );

	return trim( $collapsed );
}
1095 
/**
 * Normalize whitespace in a section name: every run of spaces and
 * underscores becomes a single space, and the result is trimmed.
 *
 * @param string $section
 * @return string
 */
public static function normalizeSectionNameWhitespace( $section ) {
	$singleSpaced = preg_replace( '/[ _]+/', ' ', $section );

	return trim( $singleSpaced );
}
1107 
/**
 * Rewrite every character reference in the text into a normalized,
 * valid form and escape ampersands that do not start a valid reference.
 *
 * @param string $text
 * @return string
 */
public static function normalizeCharReferences( $text ) {
	$normalizeOne = [ self::class, 'normalizeCharReferencesCallback' ];

	return preg_replace_callback( self::CHAR_REFS_REGEX, $normalizeOne, $text );
}
1129 
/**
 * preg_replace_callback() handler for normalizeCharReferences().
 *
 * @param array $matches Groups of CHAR_REFS_REGEX: 1 = named entity,
 *  2 = decimal reference, 3 = hexadecimal reference
 * @return string Normalized reference, or the HTML-escaped original match
 *  when the reference is not valid
 */
private static function normalizeCharReferencesCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		$normalized = self::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] !== '' ) {
		$normalized = self::decCharReference( $matches[2] );
	} elseif ( $matches[3] !== '' ) {
		$normalized = self::hexCharReference( $matches[3] );
	} else {
		$normalized = null;
	}

	return $normalized ?? htmlspecialchars( $matches[0] );
}
1149 
/**
 * Normalize a named entity.
 *
 * MediaWiki-specific aliases are mapped to their standard name, the four
 * basic entities are kept by name, other known entities are turned into
 * decimal references, and unknown names get their ampersand escaped.
 *
 * @param string $name Entity name including the trailing semicolon
 * @return string
 */
private static function normalizeEntity( $name ) {
	// Non-standard MediaWiki-specific entities
	if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
		return '&' . self::MW_ENTITY_ALIASES[$name];
	}

	// Keep these in word form
	if ( in_array( $name, [ 'lt;', 'gt;', 'amp;', 'quot;' ], true ) ) {
		return "&$name";
	}

	// Not a known entity: escape the ampersand
	if ( !isset( HTMLData::$namedEntityTranslations[$name] ) ) {
		return "&amp;$name";
	}

	// Beware: some entities expand to more than 1 codepoint
	$toDecimalReference = static function ( $char ) {
		return '&#' . UtfNormal\Utils::utf8ToCodepoint( $char[0] ) . ';';
	};

	return preg_replace_callback(
		'/./Ssu',
		$toDecimalReference,
		HTMLData::$namedEntityTranslations[$name]
	);
}
1176 
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint Digits of the reference
 * @return string|null Canonical "&#N;" form, or null if the code point is not allowed
 */
private static function decCharReference( $codepoint ) {
	$point = intval( $codepoint );

	return self::validateCodepoint( $point )
		? sprintf( '&#%d;', $point )
		: null;
}
1189 
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint Hex digits of the reference
 * @return string|null Canonical lower-case "&#xN;" form, or null if the
 *  code point is not allowed
 */
private static function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );

	return self::validateCodepoint( $point )
		? sprintf( '&#x%x;', $point )
		: null;
}
1202 
/**
 * Tell whether a code point may be used in a character reference.
 *
 * @param int|float $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	# U+007F - U+009F are disallowed in HTML5 (control characters).
	$allowedRanges = [
		[ 0x09, 0x09 ],
		[ 0x0a, 0x0a ],
		[ 0x20, 0x7e ],
		[ 0xa0, 0xd7ff ],
		[ 0xe000, 0xfffd ],
		[ 0x10000, 0x10ffff ],
	];

	foreach ( $allowedRanges as [ $low, $high ] ) {
		if ( $codepoint >= $low && $codepoint <= $high ) {
			return true;
		}
	}

	return false;
}
1220 
/**
 * Decode the character references in the text into their characters.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
	$decodeOne = [ self::class, 'decodeCharReferencesCallback' ];

	return preg_replace_callback( self::CHAR_REFS_REGEX, $decodeOne, $text );
}
1234 
/**
 * Decode the character references in the text and, if the pattern matched
 * at least once, pass the result through the content language's normalize().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
	$decoded = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		[ self::class, 'decodeCharReferencesCallback' ],
		$text,
		-1, // limit
		$replacements
	);

	// Nothing matched, so the text is returned as it came out of the decoder
	if ( !$replacements ) {
		return $decoded;
	}

	$contentLanguage = MediaWikiServices::getInstance()->getContentLanguage();

	return $contentLanguage->normalize( $decoded );
}
1260 
/**
 * preg_replace_callback() handler for decodeCharReferences().
 *
 * @param array $matches Groups of CHAR_REFS_REGEX: 1 = named entity,
 *  2 = decimal reference, 3 = hexadecimal reference
 * @return string Decoded text, or the unchanged match for a bare ampersand
 */
private static function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		return self::decodeEntity( $matches[1] );
	}
	if ( $matches[2] !== '' ) {
		return self::decodeChar( intval( $matches[2] ) );
	}
	if ( $matches[3] !== '' ) {
		return self::decodeChar( hexdec( $matches[3] ) );
	}

	# Last case should be an ampersand by itself
	return $matches[0];
}
1276 
/**
 * Return the UTF-8 encoding of a code point, or the UTF-8 replacement
 * character if the code point is not allowed.
 *
 * @param int|float $codepoint
 * @return string
 */
private static function decodeChar( $codepoint ) {
	return self::validateCodepoint( $codepoint )
		? UtfNormal\Utils::codepointToUtf8( $codepoint )
		: UtfNormal\Constants::UTF8_REPLACEMENT;
}
1291 
/**
 * Decode a named entity into its character(s).
 *
 * @param string $name Entity name including the trailing semicolon
 * @return string The translation of the entity, or "&" followed by the
 *  (alias-resolved) name when the entity is unknown
 */
private static function decodeEntity( $name ) {
	// These are MediaWiki-specific entities, not in the HTML standard
	$canonical = self::MW_ENTITY_ALIASES[$name] ?? $name;

	return HTMLData::$namedEntityTranslations[$canonical] ?? "&$canonical";
}
1308 
/**
 * Fetch the attributes allowed on the given element.
 *
 * @param string $element Element name
 * @return array Allowed attribute names as array keys; empty for an
 *  element that has no entry
 */
private static function attributesAllowedInternal( $element ) {
	// The per-element table has to be fetched before it can be indexed;
	// without this assignment $list would be undefined.
	$list = self::setupAttributesAllowedInternal();
	return $list[$element] ?? [];
}
1320 
1328  private static function setupAttributesAllowedInternal() {
1329  static $allowed;
1330 
1331  if ( $allowed !== null ) {
1332  return $allowed;
1333  }
1334 
1335  // For lookup efficiency flip each attributes array so the keys are
1336  // the valid attributes.
1337  $merge = static function ( $a, $b, $c = [] ) {
1338  return array_merge( $a, array_flip( $b ), array_flip( $c ) );
1339  };
1340  $common = $merge( [], [
1341  # HTML
1342  'id',
1343  'class',
1344  'style',
1345  'lang',
1346  'dir',
1347  'title',
1348  'tabindex',
1349 
1350  # WAI-ARIA
1351  'aria-describedby',
1352  'aria-flowto',
1353  'aria-hidden',
1354  'aria-label',
1355  'aria-labelledby',
1356  'aria-owns',
1357  'role',
1358 
1359  # RDFa
1360  # These attributes are specified in section 9 of
1361  # https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
1362  'about',
1363  'property',
1364  'resource',
1365  'datatype',
1366  'typeof',
1367 
1368  # Microdata. These are specified by
1369  # https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
1370  'itemid',
1371  'itemprop',
1372  'itemref',
1373  'itemscope',
1374  'itemtype',
1375  ] );
1376 
1377  $block = $merge( $common, [ 'align' ] );
1378 
1379  $tablealign = [ 'align', 'valign' ];
1380  $tablecell = [
1381  'abbr',
1382  'axis',
1383  'headers',
1384  'scope',
1385  'rowspan',
1386  'colspan',
1387  'nowrap', # deprecated
1388  'width', # deprecated
1389  'height', # deprecated
1390  'bgcolor', # deprecated
1391  ];
1392 
1393  # Numbers refer to sections in HTML 4.01 standard describing the element.
1394  # See: https://www.w3.org/TR/html4/
1395  $allowed = [
1396  # 7.5.4
1397  'div' => $block,
1398  'center' => $common, # deprecated
1399  'span' => $common,
1400 
1401  # 7.5.5
1402  'h1' => $block,
1403  'h2' => $block,
1404  'h3' => $block,
1405  'h4' => $block,
1406  'h5' => $block,
1407  'h6' => $block,
1408 
1409  # 7.5.6
1410  # address
1411 
1412  # 8.2.4
1413  'bdo' => $common,
1414 
1415  # 9.2.1
1416  'em' => $common,
1417  'strong' => $common,
1418  'cite' => $common,
1419  'dfn' => $common,
1420  'code' => $common,
1421  'samp' => $common,
1422  'kbd' => $common,
1423  'var' => $common,
1424  'abbr' => $common,
1425  # acronym
1426 
1427  # 9.2.2
1428  'blockquote' => $merge( $common, [ 'cite' ] ),
1429  'q' => $merge( $common, [ 'cite' ] ),
1430 
1431  # 9.2.3
1432  'sub' => $common,
1433  'sup' => $common,
1434 
1435  # 9.3.1
1436  'p' => $block,
1437 
1438  # 9.3.2
1439  'br' => $merge( $common, [ 'clear' ] ),
1440 
1441  # https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
1442  'wbr' => $common,
1443 
1444  # 9.3.4
1445  'pre' => $merge( $common, [ 'width' ] ),
1446 
1447  # 9.4
1448  'ins' => $merge( $common, [ 'cite', 'datetime' ] ),
1449  'del' => $merge( $common, [ 'cite', 'datetime' ] ),
1450 
1451  # 10.2
1452  'ul' => $merge( $common, [ 'type' ] ),
1453  'ol' => $merge( $common, [ 'type', 'start', 'reversed' ] ),
1454  'li' => $merge( $common, [ 'type', 'value' ] ),
1455 
1456  # 10.3
1457  'dl' => $common,
1458  'dd' => $common,
1459  'dt' => $common,
1460 
1461  # 11.2.1
1462  'table' => $merge( $common,
1463  [ 'summary', 'width', 'border', 'frame',
1464  'rules', 'cellspacing', 'cellpadding',
1465  'align', 'bgcolor',
1466  ] ),
1467 
1468  # 11.2.2
1469  'caption' => $block,
1470 
1471  # 11.2.3
1472  'thead' => $common,
1473  'tfoot' => $common,
1474  'tbody' => $common,
1475 
1476  # 11.2.4
1477  'colgroup' => $merge( $common, [ 'span' ] ),
1478  'col' => $merge( $common, [ 'span' ] ),
1479 
1480  # 11.2.5
1481  'tr' => $merge( $common, [ 'bgcolor' ], $tablealign ),
1482 
1483  # 11.2.6
1484  'td' => $merge( $common, $tablecell, $tablealign ),
1485  'th' => $merge( $common, $tablecell, $tablealign ),
1486 
1487  # 12.2
1488  # NOTE: <a> is not allowed directly, but this list of allowed
1489  # attributes is used from the Parser object
1490  'a' => $merge( $common, [ 'href', 'rel', 'rev' ] ), # rel/rev esp. for RDFa
1491 
1492  # 13.2
1493  # Not usually allowed, but may be used for extension-style hooks
1494  # such as <math> when it is rasterized, or if $wgAllowImageTag is
1495  # true
1496  'img' => $merge( $common, [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
1497  # Attributes for A/V tags added in T163583 / T133673
1498  'audio' => $merge( $common, [ 'controls', 'preload', 'width', 'height' ] ),
1499  'video' => $merge( $common, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
1500  'source' => $merge( $common, [ 'type', 'src' ] ),
1501  'track' => $merge( $common, [ 'type', 'src', 'srclang', 'kind', 'label' ] ),
1502 
1503  # 15.2.1
1504  'tt' => $common,
1505  'b' => $common,
1506  'i' => $common,
1507  'big' => $common,
1508  'small' => $common,
1509  'strike' => $common,
1510  's' => $common,
1511  'u' => $common,
1512 
1513  # 15.2.2
1514  'font' => $merge( $common, [ 'size', 'color', 'face' ] ),
1515  # basefont
1516 
1517  # 15.3
1518  'hr' => $merge( $common, [ 'width' ] ),
1519 
1520  # HTML Ruby annotation text module, simple ruby only.
1521  # https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
1522  'ruby' => $common,
1523  # rbc
1524  'rb' => $common,
1525  'rp' => $common,
1526  'rt' => $common, # $merge( $common, [ 'rbspan' ] ),
1527  'rtc' => $common,
1528 
1529  # MathML root element, where used for extensions
1530  # 'title' may not be 100% valid here; it's XHTML
1531  # https://www.w3.org/TR/REC-MathML/
1532  'math' => $merge( [], [ 'class', 'style', 'id', 'title' ] ),
1533 
1534  // HTML 5 section 4.5
1535  'figure' => $common,
1536  'figcaption' => $common,
1537 
1538  # HTML 5 section 4.6
1539  'bdi' => $common,
1540 
1541  # HTML5 elements, defined by:
1542  # https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
1543  'data' => $merge( $common, [ 'value' ] ),
1544  'time' => $merge( $common, [ 'datetime' ] ),
1545  'mark' => $common,
1546 
1547  // meta and link are only permitted by removeHTMLtags when Microdata
1548  // is enabled so we don't bother adding a conditional to hide these
1549  // Also meta and link are only valid in WikiText as Microdata elements
1550  // (ie: validateTag rejects tags missing the attributes needed for Microdata)
1551  // So we don't bother including $common attributes that have no purpose.
1552  'meta' => $merge( [], [ 'itemprop', 'content' ] ),
1553  'link' => $merge( [], [ 'itemprop', 'href', 'title' ] ),
1554  ];
1555 
1556  return $allowed;
1557  }
1558 
/**
 * Take a fragment of (potentially invalid) HTML and return its text content
 * with all tags removed.
 *
 * The input is tokenized with RemexHtml; a RemexStripTagHandler collects the
 * text, which is then passed through self::normalizeWhitespace().
 *
 * @param string $html HTML fragment
 * @return string
 */
public static function stripAllTags( $html ) {
	// Character references are deliberately NOT ignored here: we want the
	// tokenizer to decode them.
	$tokenizerOptions = [
		'ignoreErrors' => true,
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	];

	// The handler accumulates the text while the tokenizer walks the input.
	$textCollector = new RemexStripTagHandler;
	( new RemexHtml\Tokenizer\Tokenizer( $textCollector, $html, $tokenizerOptions ) )->execute();

	return self::normalizeWhitespace( $textCollector->getResult() );
}
1585 
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations.
 *
 * One <!ENTITY> declaration is emitted for every semicolon-terminated name in
 * HTMLData::$namedEntityTranslations whose self::normalizeEntity() result
 * differs from the entity name itself.
 *
 * @return string
 */
public static function hackDocType() {
	$declarations = '';
	foreach ( HTMLData::$namedEntityTranslations as $entity => $translation ) {
		// Some HTML entities omit the trailing semicolon;
		// wikitext does not permit these, so only terminated names are kept.
		if ( substr( $entity, -1 ) === ';' ) {
			$expansion = self::normalizeEntity( $entity );
			// Entities that normalize to themselves are skipped (&lt; &gt; etc)
			if ( $expansion !== $entity ) {
				$name = substr( $entity, 0, -1 );
				$declarations .= "<!ENTITY $name \"$expansion\">";
			}
		}
	}

	return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
1616 
/**
 * Clean up a URL: decode character references, percent-encode unsafe
 * characters, strip characters that are ignored in IDNs from the host part,
 * and url-decode the brackets around IPv6 hosts.
 *
 * @param string $url
 * @return string The cleaned URL. If the input does not look like
 *  "protocol:[//host]rest", it is returned after the first two steps only.
 */
public static function cleanUrl( $url ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = self::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step
	$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
		[ __CLASS__, 'cleanUrlCallback' ], $url );

	# Validate hostname portion
	$matches = [];
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
		return $url;
	}
	// $host is the empty string when the URL has no "//authority" part.
	[ /* $whole */, $protocol, $host, $rest ] = $matches;

	// Characters that will be ignored in IDNs.
	// https://tools.ietf.org/html/rfc3454#section-3.1
	// Strip them before further processing so deny lists and such work.
	$strip = "/
		\\s| # general whitespace
		\xc2\xad| # 00ad SOFT HYPHEN
		\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
		\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
		\xe2\x81\xa0| # 2060 WORD JOINER
		\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
		\xcd\x8f| # 034f COMBINING GRAPHEME JOINER
		\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
		\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
		\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
		\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
		\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
		[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
		/xuD";

	// preg_replace() returns null on a PCRE error, which the /u modifier
	// triggers when the host contains invalid UTF-8. Keep the unstripped host
	// in that case instead of silently dropping the whole authority part.
	$strippedHost = preg_replace( $strip, '', $host );
	if ( $strippedHost !== null ) {
		$host = $strippedHost;
	}

	// IPv6 host names are bracketed with []. Url-decode these.
	// The cheap prefix test avoids running the regex on ordinary hosts.
	if ( strncmp( $host, '//%5B', 5 ) === 0 &&
		preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches )
	) {
		$host = '//[' . $matches[1] . ']' . $matches[2];
	}

	// @todo FIXME: Validate hostnames here

	return $protocol . $host . $rest;
}
1670 
/**
 * preg_replace_callback() handler used by cleanUrl(): percent-encodes the
 * matched text with urlencode().
 *
 * @param array $matches Match array; only the full match at index 0 is used
 * @return string
 */
private static function cleanUrlCallback( $matches ) {
	$matchedText = $matches[0];

	return urlencode( $matchedText );
}
1678 
/**
 * Does a string look like an e-mail address?
 *
 * A hook handler gets the first say via onIsValidEmailAddr(); when the hook
 * run returns false, the value the handler stored in $result is returned
 * as-is. Otherwise the address is matched against a liberal regular
 * expression: one or more atext characters or dots, an "@", then one or more
 * dot-separated labels made of letters, digits and hyphens.
 *
 * @param string $addr E-mail address
 * @return bool|mixed True or false from the built-in check, or whatever the
 *  hook handler placed in $result when it aborted the hook run
 */
public static function validateEmail( $addr ) {
	$result = null;
	// TODO This method should be non-static, and have a HookRunner injected
	if ( !Hooks::runner()->onIsValidEmailAddr( $addr, $result ) ) {
		return $result;
	}

	// Please note the strings below are used inside brackets [], which makes
	// the hyphen "-" a range indicator. Hence it is double backslashed below.
	// See T28948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	// The D modifier (PCRE_DOLLAR_ENDONLY) is required: without it "$" also
	// matches just before a trailing newline, so "user@example.com\n" would
	// be accepted as valid.
	$html5_email_regexp = "/
		^ # start of string
		[$rfc5322_atext\\.]+ # user part which is liberal :p
		@ # 'apostrophe'
		[$rfc1034_ldh_str]+ # First domain part
		(\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
		$ # End of string
		/ixD"; // case Insensitive, eXtended, Dollar matches only at the very end

	// preg_match() returns 1, 0 or false; the cast maps an error to "invalid".
	return (bool)preg_match( $html5_email_regexp, $addr );
}
1731 }
Sanitizer\ID_FALLBACK
const ID_FALLBACK
Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false if no fallback...
Definition: Sanitizer.php:78
Sanitizer\normalizeEntity
static normalizeEntity( $name)
If the named entity is defined in HTML5 return the equivalent numeric entity reference (except for th...
Definition: Sanitizer.php:1160
Sanitizer\getTagAttributeCallback
static getTagAttributeCallback( $set)
Pick the appropriate attribute value from a match set from the attribs regex matches.
Definition: Sanitizer.php:1065
Sanitizer\removeHTMLcomments
static removeHTMLcomments( $text)
Remove '<!--', '-->', and everything between.
Definition: Sanitizer.php:308
Sanitizer\stripAllTags
static stripAllTags( $html)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed,...
Definition: Sanitizer.php:1570
Sanitizer\EVIL_URI_PATTERN
const EVIL_URI_PATTERN
Pattern matching evil uris like javascript: WARNING: DO NOT use this in any place that actually requi...
Definition: Sanitizer.php:62
MediaWiki\MediaWikiServices
MediaWikiServices is the service locator for the application scope of MediaWiki.
Definition: MediaWikiServices.php:173
$wgExternalInterwikiFragmentMode
$wgExternalInterwikiFragmentMode
Which ID escaping mode should be used for external interwiki links? See documentation for $wgFragment...
Definition: DefaultSettings.php:3681
Sanitizer\escapeIdForAttribute
static escapeIdForAttribute( $id, $mode=self::ID_PRIMARY)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid HTM...
Definition: Sanitizer.php:811
Sanitizer\removeHTMLtags
static removeHTMLtags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments.
Definition: Sanitizer.php:239
Sanitizer\decodeEntity
static decodeEntity( $name)
If the named entity is defined in HTML5 return the UTF-8 encoding of that character.
Definition: Sanitizer.php:1300
Sanitizer\mergeAttributes
static mergeAttributes( $a, $b)
Merge two sets of HTML attributes.
Definition: Sanitizer.php:541
Sanitizer\validateAttributes
static validateAttributes( $attribs, $allowed)
Take an array of attribute names and values and normalize or discard illegal values.
Definition: Sanitizer.php:413
Sanitizer\normalizeSectionNameWhitespace
static normalizeSectionNameWhitespace( $section)
Normalizes whitespace in a section name, such as might be returned by Parser::stripSectionName(),...
Definition: Sanitizer.php:1104
Sanitizer\escapeIdInternal
static escapeIdInternal( $id, $mode)
Helper for escapeIdFor*() functions.
Definition: Sanitizer.php:893
Sanitizer\validateEmail
static validateEmail( $addr)
Does a string look like an e-mail address?
Definition: Sanitizer.php:1707
Sanitizer\attributesAllowedInternal
static attributesAllowedInternal( $element)
Fetch the list of acceptable attributes for a given element name.
Definition: Sanitizer.php:1316
Sanitizer\decCharReference
static decCharReference( $codepoint)
Definition: Sanitizer.php:1181
Sanitizer\safeEncodeTagAttributes
static safeEncodeTagAttributes( $assoc_array)
Build a partial tag string from an associative array of attribute names and values as returned by dec...
Definition: Sanitizer.php:1046
Sanitizer\decodeCharReferencesAndNormalize
static decodeCharReferencesAndNormalize( $text)
Decode any character references, numeric or named entities, in the text and normalize the resulting s...
Definition: Sanitizer.php:1245
Sanitizer\$attribsRegex
static $attribsRegex
Lazy-initialised attributes regex, see getAttribsRegex()
Definition: Sanitizer.php:92
$wgFragmentMode
$wgFragmentMode
How should section IDs be encoded? This array can contain 1 or 2 elements, each of them can be one of...
Definition: DefaultSettings.php:3671
Sanitizer\escapeClass
static escapeClass( $class)
Given a value, escape it so that it can be used as a CSS class and return it.
Definition: Sanitizer.php:972
Sanitizer\normalizeCharReferencesCallback
static normalizeCharReferencesCallback( $matches)
Definition: Sanitizer.php:1134
Sanitizer\armorFrenchSpaces
static armorFrenchSpaces( $text, $space='&#160;')
Armor French spaces with a replacement character.
Definition: Sanitizer.php:746
RemexStripTagHandler
Definition: RemexStripTagHandler.php:9
Sanitizer\validateTag
static validateTag( $params, $element)
Takes attribute names and values for a tag and the tag name and validates that the tag is allowed to ...
Definition: Sanitizer.php:354
Sanitizer\$attribNameRegex
static $attribNameRegex
Lazy-initialised attribute name regex, see getAttribNameRegex()
Definition: Sanitizer.php:123
Sanitizer\hackDocType
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
Definition: Sanitizer.php:1597
Sanitizer\XMLNS_ATTRIBUTE_PATTERN
const XMLNS_ATTRIBUTE_PATTERN
Definition: Sanitizer.php:63
$wgAllowImageTag
$wgAllowImageTag
A different approach to the above: simply allow the "<img>" tag to be used.
Definition: DefaultSettings.php:4724
wfDeprecatedMsg
wfDeprecatedMsg( $msg, $version=false, $component=false, $callerOffset=2)
Log a deprecation warning with arbitrary message text.
Definition: GlobalFunctions.php:1066
MWException
MediaWiki exception.
Definition: MWException.php:29
wfDeprecated
wfDeprecated( $function, $version=false, $component=false, $callerOffset=2)
Logs a warning that $function is deprecated.
Definition: GlobalFunctions.php:1034
Sanitizer\safeEncodeAttribute
static safeEncodeAttribute( $text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing.
Definition: Sanitizer.php:765
$matches
$matches
Definition: NoLocalSettings.php:24
Sanitizer\encodeAttribute
static encodeAttribute( $text)
Encode an attribute value for HTML output.
Definition: Sanitizer.php:723
$args
if( $line===false) $args
Definition: mcc.php:124
Sanitizer\escapeIdReferenceListInternal
static escapeIdReferenceListInternal( $referenceString)
Given a string containing a space delimited list of ids, escape each id to match ids escaped by the e...
Definition: Sanitizer.php:945
Sanitizer\hexCharReference
static hexCharReference( $codepoint)
Definition: Sanitizer.php:1194
Sanitizer\validateCodepoint
static validateCodepoint( $codepoint)
Returns true if a given Unicode codepoint is a valid character in both HTML5 and XML.
Definition: Sanitizer.php:1209
wfUrlProtocols
wfUrlProtocols( $includeProtocolRelative=true)
Returns a regular expression of url protocols.
Definition: GlobalFunctions.php:723
Sanitizer\MW_ENTITY_ALIASES
const MW_ENTITY_ALIASES
Character entity aliases accepted by MediaWiki in wikitext.
Definition: Sanitizer.php:84
Sanitizer\validateTagAttributes
static validateTagAttributes( $attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
Definition: Sanitizer.php:390
Sanitizer\ELEMENT_BITS_REGEX
const ELEMENT_BITS_REGEX
Acceptable tag name charset from HTML5 parsing spec https://www.w3.org/TR/html5/syntax....
Definition: Sanitizer.php:51
Hooks\runner
static runner()
Get a HookRunner instance for calling hooks using the new interfaces.
Definition: Hooks.php:172
Sanitizer\escapeIdForExternalInterwiki
static escapeIdForExternalInterwiki( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
Definition: Sanitizer.php:861
Sanitizer\isReservedDataAttribute
static isReservedDataAttribute( $attr)
Given an attribute name, checks whether it is a reserved data attribute (such as data-mw-foo) which i...
Definition: Sanitizer.php:520
Sanitizer\cleanUrl
static cleanUrl( $url)
Definition: Sanitizer.php:1621
Sanitizer\setupAttributesAllowedInternal
static setupAttributesAllowedInternal()
Foreach array key (an allowed HTML element), return an array of allowed attributes.
Definition: Sanitizer.php:1328
Sanitizer\cssDecodeCallback
static cssDecodeCallback( $matches)
Definition: Sanitizer.php:661
Sanitizer\getRecognizedTagData
static getRecognizedTagData( $extratags=[], $removetags=[])
Return the various lists of recognized tags.
Definition: Sanitizer.php:144
Sanitizer\fixTagAttributes
static fixTagAttributes( $text, $element, $sorted=false)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML,...
Definition: Sanitizer.php:703
Sanitizer\escapeIdInternalUrl
static escapeIdInternalUrl( $id, $mode)
Do percent encoding of percent signs for href (but not id) attributes.
Definition: Sanitizer.php:878
Sanitizer\normalizeCss
static normalizeCss( $value)
Normalize CSS into a format we can easily search for hostile input.
Definition: Sanitizer.php:562
Sanitizer\ID_PRIMARY
const ID_PRIMARY
Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding.
Definition: Sanitizer.php:70
Sanitizer\CHAR_REFS_REGEX
const CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferenc...
Definition: Sanitizer.php:41
Sanitizer\normalizeWhitespace
static normalizeWhitespace( $text)
Definition: Sanitizer.php:1089
Sanitizer\decodeChar
static decodeChar( $codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEM...
Definition: Sanitizer.php:1284
Sanitizer\getAttribNameRegex
static getAttribNameRegex()
Used in Sanitizer::decodeTagAttributes to filter attributes.
Definition: Sanitizer.php:129
StringUtils\delimiterReplace
static delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags='')
Perform an operation equivalent to preg_replace() with flags.
Definition: StringUtils.php:248
Sanitizer\normalizeCharReferences
static normalizeCharReferences( $text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
Definition: Sanitizer.php:1123
Sanitizer\escapeIdForLink
static escapeIdForLink( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
Definition: Sanitizer.php:838
Sanitizer\decodeTagAttributes
static decodeTagAttributes( $text)
Return an associative array of attribute names and values from a partial tag string.
Definition: Sanitizer.php:1004
$t
$t
Definition: testCompression.php:74
Sanitizer\decodeCharReferences
static decodeCharReferences( $text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
Definition: Sanitizer.php:1228
Sanitizer
HTML sanitizer for MediaWiki.
Definition: Sanitizer.php:34
Sanitizer\checkCss
static checkCss( $value)
Pick apart some CSS and check it for forbidden or unsafe structures.
Definition: Sanitizer.php:632
Sanitizer\escapeIdReferenceList
static escapeIdReferenceList( $referenceString)
Given a string containing a space delimited list of ids, escape each id to match ids escaped by the e...
Definition: Sanitizer.php:933
Sanitizer\getAttribsRegex
static getAttribsRegex()
Regular expression to match HTML/XML attribute pairs within a tag.
Definition: Sanitizer.php:100
Sanitizer\escapeHtmlAllowEntities
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Definition: Sanitizer.php:987
Sanitizer\decodeCharReferencesCallback
static decodeCharReferencesCallback( $matches)
Definition: Sanitizer.php:1265
Sanitizer\cleanUrlCallback
static cleanUrlCallback( $matches)
Definition: Sanitizer.php:1675