MediaWiki  master
Sanitizer.php
Go to the documentation of this file.
1 <?php
28 use RemexHtml\HTMLData;
29 
34 class Sanitizer {
// Regex matching every "&" sequence in a string (extended mode). Capture
// groups: 1 = named entity including its trailing ";", 2 = decimal
// reference digits, 3 = hexadecimal reference digits, 4 = a bare "&".
// Used by normalizeCharReferences() and the decodeCharReferences*() methods.
41  private const CHAR_REFS_REGEX =
42  '/&([A-Za-z0-9\x80-\xff]+;)
43  |&\#([0-9]+);
44  |&\#[xX]([0-9A-Fa-f]+);
45  |(&)/x';
46 
// Regex applied by removeHTMLtags() to the text following each "<".
// Capture groups: 1 = optional leading "/", 2 = tag name, 3 = attribute
// text, 4 = closing ">" or "/>", 5 = text up to the next "<".
51  private const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
52 
// Case-insensitive pattern flagging "javascript"/"vbscript" at the start of
// a value, after whitespace, or after a closing "*/" comment marker.
62  private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
// Pattern recognising "xmlns:<prefix>" attribute names.
63  private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
64 
// Index into $wgFragmentMode selecting the primary id-escaping mode.
70  public const ID_PRIMARY = 0;
71 
// Index into $wgFragmentMode selecting the fallback id-escaping mode;
// escapeIdForAttribute() returns false when this entry is not configured.
78  public const ID_FALLBACK = 1;
79 
// Non-standard, MediaWiki-specific entity names (including the trailing ";")
// mapped to the standard entity name they alias.
84  private const MW_ENTITY_ALIASES = [
85  'רלמ;' => 'rlm;',
86  'رلم;' => 'rlm;',
87  ];
88 
/**
 * Cached attribute-splitting regex; null until getAttribsRegex() builds it.
 */
private static $attribsRegex;

/**
 * Return the regex used to pull name/value pairs out of a tag's attribute
 * text, building and caching it on first use.
 *
 * Capture groups: 1 = attribute name, 2 = the whole "= value" part,
 * 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value.
 *
 * @return string PCRE pattern
 */
private static function getAttribsRegex() {
    if ( self::$attribsRegex !== null ) {
        return self::$attribsRegex;
    }

    $wsChars = '\x09\x0a\x0c\x0d\x20';
    $ws = "[{$wsChars}]";
    $nameChar = "[^{$wsChars}\/>=]";
    // A name may start with "=", but may not contain one afterwards.
    $nameStart = "(?:{$nameChar}|=)";

    self::$attribsRegex =
        "/({$nameStart}{$nameChar}*)
        ($ws*=$ws*
        (?:
        # The attribute value: quoted or alone
        \"([^\"]*)(?:\"|\$)
        | '([^']*)(?:'|\$)
        | (((?!$ws|>).)*)
        )
        )?/sxu";

    return self::$attribsRegex;
}
119 
/**
 * Cached attribute-name regex; null until getAttribNameRegex() builds it.
 */
private static $attribNameRegex;

/**
 * Return the regex that an attribute name must match in full to be kept,
 * building and caching it on first use.
 *
 * The first character must be ":", "_", a Unicode letter or a Unicode
 * number; later characters may also be "." or "-".
 *
 * @return string PCRE pattern
 */
private static function getAttribNameRegex() {
    if ( self::$attribNameRegex !== null ) {
        return self::$attribNameRegex;
    }

    $firstChar = "[:_\p{L}\p{N}]";
    $laterChar = "[:_\.\-\p{L}\p{N}]";
    self::$attribNameRegex = "/^({$firstChar}{$laterChar}*)$/sxu";

    return self::$attribNameRegex;
}
137 
/**
 * Return the lookup tables describing which HTML tags are recognised and
 * how each one may be used.
 *
 * Every table is a hash keyed by lower-case tag name, so callers test
 * membership with isset(). The static tables are built once and rebuilt
 * only when the truthiness of $wgAllowImageTag changes. (The cache is keyed
 * on an explicit "built" marker rather than on the setting's own value, so
 * a disabled setting no longer forces a rebuild on every call.)
 *
 * @param string[] $extratags Extra tag names to treat as recognised paired tags
 * @param string[] $removetags Tag names to drop from the recognised elements
 * @return array Map with keys 'htmlpairs', 'htmlsingle', 'htmlsingleonly',
 *   'htmlnest', 'tabletags', 'htmllist', 'listtags', 'htmlsingleallowed'
 *   and 'htmlelements'
 */
public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
    global $wgAllowImageTag;

    static $tagSets = null, $builtForImageTag = null;

    $allowImageTag = (bool)$wgAllowImageTag;
    if ( $tagSets === null || $builtForImageTag !== $allowImageTag ) {
        $htmlpairsStatic = [ # Tags that must be closed
            'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
            'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
            'strike', 'strong', 'tt', 'var', 'div', 'center',
            'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
            'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
            'kbd', 'samp', 'data', 'time', 'mark'
        ];
        $htmlsingle = [
            'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
        ];

        # Elements that cannot have close tags. This is (not coincidentally)
        # also the list of tags for which the HTML 5 parsing algorithm
        # requires you to "acknowledge the token's self-closing flag", i.e.
        # a self-closing tag like <br/> is not an HTML 5 parse error only
        # for this list.
        $htmlsingleonly = [
            'br', 'wbr', 'hr', 'meta', 'link'
        ];

        $htmlnest = [ # Tags that can be nested--??
            'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
            'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
            'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
        ];
        $tabletags = [ # Can only appear inside table, we will close them
            'td', 'th', 'tr',
        ];
        $htmllist = [ # Tags used by list
            'ul', 'ol',
        ];
        $listtags = [ # Tags that can appear in a list
            'li',
        ];

        if ( $allowImageTag ) {
            wfDeprecatedMsg( 'Setting $wgAllowImageTag to true ' .
                'is deprecated since MediaWiki 1.35', '1.35', false, false );
            $htmlsingle[] = 'img';
            $htmlsingleonly[] = 'img';
        }

        # Convert them all to hashtables for faster lookup
        $tagSets = array_map( 'array_flip', [
            'htmlpairsStatic' => $htmlpairsStatic,
            'htmlsingle' => $htmlsingle,
            'htmlsingleonly' => $htmlsingleonly,
            'htmlnest' => $htmlnest,
            'tabletags' => $tabletags,
            'htmllist' => $htmllist,
            'listtags' => $listtags,
            'htmlsingleallowed' => array_unique( array_merge( $htmlsingle, $tabletags ) ),
            'htmlelementsStatic' => array_unique(
                array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest )
            ),
        ] );
        $builtForImageTag = $allowImageTag;
    }

    # Populate htmlpairs and htmlelements with the $extratags and $removetags arrays
    $extratags = array_flip( $extratags );
    $removetags = array_flip( $removetags );

    return [
        'htmlpairs' => array_merge( $extratags, $tagSets['htmlpairsStatic'] ),
        'htmlsingle' => $tagSets['htmlsingle'],
        'htmlsingleonly' => $tagSets['htmlsingleonly'],
        'htmlnest' => $tagSets['htmlnest'],
        'tabletags' => $tagSets['tabletags'],
        'htmllist' => $tagSets['htmllist'],
        'listtags' => $tagSets['listtags'],
        'htmlsingleallowed' => $tagSets['htmlsingleallowed'],
        'htmlelements' => array_diff_key(
            array_merge( $extratags, $tagSets['htmlelementsStatic'] ),
            $removetags
        ),
    ];
}
227 
/**
 * Clean up HTML in $text: strip comments, keep recognised tags (with their
 * attributes sanitised) and escape everything else so it renders as text.
 *
 * @param string $text
 * @param callable|null $processCallback Optional callback run on each
 *   recognised tag's raw attribute text; it receives that text by reference
 *   followed by $args
 * @param array|bool $args Passed through to $processCallback
 * @param string[] $extratags Extra tag names to allow
 * @param string[] $removetags Tag names to disallow
 * @return string
 */
public static function removeHTMLtags( $text, $processCallback = null,
    $args = [], $extratags = [], $removetags = []
) {
    $tagData = self::getRecognizedTagData( $extratags, $removetags );
    $htmlsingle = $tagData['htmlsingle'];
    $htmlsingleonly = $tagData['htmlsingleonly'];
    $htmlelements = $tagData['htmlelements'];
    // The callback does not change between tags, so test it only once.
    $hasCallback = is_callable( $processCallback );

    # Remove HTML comments
    $text = self::removeHTMLcomments( $text );
    $bits = explode( '<', $text );
    // Text before the first "<" cannot contain a tag; just escape ">".
    $text = str_replace( '>', '&gt;', array_shift( $bits ) );

    # this might be possible using remex tidy itself
    foreach ( $bits as $x ) {
        if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
            [ , $slash, $t, $params, $brace, $rest ] = $regs;

            $t = strtolower( $t );
            if ( isset( $htmlelements[$t] ) ) {
                if ( $hasCallback ) {
                    call_user_func_array( $processCallback, [ &$params, $args ] );
                }

                if ( $brace === '/>'
                    && !isset( $htmlsingle[$t] )
                    && !isset( $htmlsingleonly[$t] )
                ) {
                    // Remove the self-closing slash, to be consistent
                    // with HTML5 semantics. T134423
                    $brace = '>';
                }

                $badtag = !self::validateTag( $params, $t );
                $newparams = self::fixTagAttributes( $params, $t );

                if ( !$badtag ) {
                    if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) {
                        # Interpret self-closing tags as empty tags even when
                        # HTML 5 would interpret them as start tags. Such input
                        # is commonly seen on Wikimedia wikis with this intention.
                        $brace = "></$t>";
                    }

                    $rest = str_replace( '>', '&gt;', $rest );
                    $text .= "<$slash$t$newparams$brace$rest";
                    continue;
                }
            }
        }
        // Not a tag we keep: emit the "<" escaped so it shows as text.
        $text .= '&lt;' . str_replace( '>', '&gt;', $x );
    }
    return $text;
}
298 
/**
 * Strip every terminated HTML comment from $text.
 *
 * When a comment sits alone on its line (only spaces between it and the
 * surrounding newlines) the whole line is collapsed to a single newline;
 * otherwise only the comment itself is removed. Processing stops at the
 * first unterminated comment, which is left in place.
 *
 * @param string $text
 * @return string
 */
public static function removeHTMLcomments( $text ) {
    $open = strpos( $text, '<!--' );
    while ( $open !== false ) {
        $close = strpos( $text, '-->', $open + 4 );
        if ( $close === false ) {
            # Unterminated comment; bail out
            break;
        }
        $afterComment = $close + 3;

        # Grow the span over spaces on both sides of the comment
        $left = max( $open - 1, 0 );
        $span = $afterComment - $left;
        while ( $left > 0 && substr( $text, $left, 1 ) === ' ' ) {
            $left--;
            $span++;
        }
        while ( substr( $text, $left + $span, 1 ) === ' ' ) {
            $span++;
        }

        $newlineBefore = substr( $text, $left, 1 ) === "\n";
        $newlineAfter = substr( $text, $left + $span, 1 ) === "\n";
        if ( $newlineBefore && $newlineAfter ) {
            # Drop the comment with its padding and keep one newline
            $text = substr_replace( $text, "\n", $left, $span + 1 );
        } else {
            # Drop just the comment
            $text = substr_replace( $text, '', $open, $afterComment - $open );
        }

        $open = strpos( $text, '<!--' );
    }
    return $text;
}
341 
/**
 * Check whether a tag carries the attributes it needs to be valid.
 *
 * Only <meta> and <link> have requirements: both need an itemprop
 * attribute, <meta> also needs content and <link> also needs href.
 * Every other element passes.
 *
 * @param string $params Raw attribute text of the tag
 * @param string $element Lower-case tag name
 * @return bool
 */
private static function validateTag( $params, $element ) {
    $attribs = self::decodeTagAttributes( $params );

    $isMeta = $element == 'meta';
    $isLink = $element == 'link';
    if ( !$isMeta && !$isLink ) {
        return true;
    }

    // <meta> and <link> must have an itemprop="" otherwise they are not
    // valid or safe in content
    if ( !isset( $attribs['itemprop'] ) ) {
        return false;
    }
    // <meta> must have a content="" for the itemprop
    if ( $isMeta && !isset( $attribs['content'] ) ) {
        return false;
    }
    // <link> must have an associated href=""
    if ( $isLink && !isset( $attribs['href'] ) ) {
        return false;
    }

    return true;
}
374 
/**
 * Filter an attribute map down to what is allowed on the given element.
 *
 * @param array $attribs Attribute name => value
 * @param string $element Tag name
 * @return array Filtered attribute name => value map
 */
public static function validateTagAttributes( $attribs, $element ) {
    $allowedForElement = self::attributesAllowedInternal( $element );

    return self::validateAttributes( $attribs, $allowedForElement );
}
394 
/**
 * Filter an attribute map against a list of allowed attribute names,
 * sanitising the values of the attributes that are kept.
 *
 * Kept regardless of $allowed: "xmlns:*" declarations with a harmless value
 * and "data-*" attributes that are neither namespaced nor reserved.
 * Value handling: style goes through checkCss(), id and id-reference lists
 * are escaped, RDFa/microdata values matching EVIL_URI_PATTERN are dropped,
 * href/src/poster must start with an allowed protocol, and tabindex is kept
 * only when it is "0".
 *
 * @param array $attribs Attribute name => value
 * @param array $allowed Allowed attribute names as array keys (a sequential
 *   list of names is still accepted but deprecated)
 * @return array Filtered attribute name => value map
 */
public static function validateAttributes( $attribs, $allowed ) {
    if ( isset( $allowed[0] ) ) {
        // Calling this function with a sequential array is
        // deprecated. For now just convert it.
        wfDeprecated( __METHOD__ . ' with sequential array', '1.35' );
        $allowed = array_flip( $allowed );
    }
    $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

    # Attributes whose value is a space-separated list of ids
    $idListAttribs = [ 'aria-describedby', 'aria-flowto', 'aria-labelledby', 'aria-owns' ];
    # RDFa and microdata attributes, which allow URLs, URIs and/or CURIs
    $uriLikeAttribs = [
        'rel', 'rev',
        # RDFa
        'about', 'property', 'resource', 'datatype', 'typeof',
        # HTML5 microdata
        'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',
    ];
    $urlAttribs = [ 'href', 'src', 'poster' ];

    $out = [];
    foreach ( $attribs as $attribute => $value ) {
        # Allow XML namespace declaration to allow RDFa
        if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
            if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
                $out[$attribute] = $value;
            }
            continue;
        }

        # Allow any attribute beginning with "data-"
        # However:
        # * Disallow data attributes used by MediaWiki code
        # * Ensure that the attribute is not namespaced by banning
        # colons.
        $isDataAttrib = preg_match( '/^data-[^:]*$/i', $attribute );
        if ( !$isDataAttrib && !array_key_exists( $attribute, $allowed ) ) {
            continue;
        }
        if ( self::isReservedDataAttribute( $attribute ) ) {
            continue;
        }

        # Strip javascript "expression" from stylesheets.
        # https://msdn.microsoft.com/en-us/library/ms537634.aspx
        if ( $attribute == 'style' ) {
            $value = self::checkCss( $value );
        }

        # Escape HTML id attributes
        if ( $attribute === 'id' ) {
            $value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
        }

        # Escape HTML id reference lists
        if ( in_array( $attribute, $idListAttribs, true ) ) {
            $value = self::escapeIdReferenceListInternal( $value );
        }

        // Paranoia. Allow "simple" values but suppress javascript
        if ( in_array( $attribute, $uriLikeAttribs, true )
            && preg_match( self::EVIL_URI_PATTERN, $value )
        ) {
            continue;
        }

        # NOTE: even though elements using href/src are not allowed directly, supply
        # validation code that can be used by tag hook handlers, etc
        if ( in_array( $attribute, $urlAttribs, true ) && !preg_match( $hrefExp, $value ) ) {
            // drop any href or src attributes not using an allowed protocol.
            // NOTE: this also drops all relative URLs
            continue;
        }

        // Only allow tabindex of 0, which is useful for accessibility.
        if ( $attribute === 'tabindex' && $value !== '0' ) {
            continue;
        }

        // If this attribute was previously set, override it.
        // Output should only have one attribute of each name.
        $out[$attribute] = $value;
    }

    # itemtype, itemid, itemref don't make sense without itemscope
    if ( !array_key_exists( 'itemscope', $out ) ) {
        unset( $out['itemtype'], $out['itemid'], $out['itemref'] );
    }
    # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

    return $out;
}
512 
/**
 * Tell whether an attribute name is a data attribute reserved for
 * MediaWiki's own use.
 *
 * Reserved prefixes (case-insensitive): "data-ooui" (ooui), "data-mw" and
 * "data-parsoid" (parsoid; "data-mw-<name>" is for extensions or core that
 * need to pass trusted data to the client). Namespaces are ignored because
 * user-generated HTML can't use them anymore.
 *
 * @param string $attr Attribute name
 * @return bool
 */
public static function isReservedDataAttribute( $attr ) {
    $reservedPrefix = '/^data-(ooui|mw|parsoid)/i';

    return preg_match( $reservedPrefix, $attr ) === 1;
}
530 
/**
 * Merge two attribute maps; values in $b win over values in $a.
 *
 * The one exception is 'class': when both maps carry a different string
 * class value, the two lists are joined, split on whitespace and
 * de-duplicated instead of one replacing the other.
 *
 * @param array $a
 * @param array $b
 * @return array
 */
public static function mergeAttributes( $a, $b ) {
    $merged = array_merge( $a, $b );

    $bothHaveClass = isset( $a['class'] ) && isset( $b['class'] );
    if ( !$bothHaveClass
        || !is_string( $a['class'] )
        || !is_string( $b['class'] )
        || $a['class'] === $b['class']
    ) {
        return $merged;
    }

    $tokens = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
        -1, PREG_SPLIT_NO_EMPTY );
    $merged['class'] = implode( ' ', array_unique( $tokens ) );

    return $merged;
}
553 
/**
 * Normalize a CSS value so it can be checked for dangerous content:
 * decode character references, decode CSS escape sequences and line
 * continuations, then strip comments.
 *
 * A value consisting of nothing but a single comment is returned with the
 * comment intact, so that callers which reject a value can pass an error
 * message through.
 *
 * @param string $value
 * @return string
 */
public static function normalizeCss( $value ) {
    static $decodeRegex;

    // Decode character references like &#123;
    $value = self::decodeCharReferences( $value );

    // Decode escape sequences and line continuation
    // See the grammar in the CSS 2 spec, appendix D.
    // This has to be done AFTER decoding character references.
    // This means it isn't possible for this function to return
    // unsanitized escape sequences. It is possible to manufacture
    // input that contains character references that decode to
    // escape sequences that decode to character references, but
    // it's OK for the return value to contain character references
    // because the caller is supposed to escape those anyway.
    if ( !$decodeRegex ) {
        $space = '[\\x20\\t\\r\\n\\f]';
        $nl = '(?:\\n|\\r\\n|\\r|\\f)';
        $backslash = '\\\\';
        $decodeRegex = "/ $backslash
            (?:
            ($nl) | # 1. Line continuation
            ([0-9A-Fa-f]{1,6})$space? | # 2. character number
            (.) | # 3. backslash cancelling special meaning
            () | # 4. backslash at end of string
            )/xu";
    }
    $value = preg_replace_callback(
        $decodeRegex,
        [ __CLASS__, 'cssDecodeCallback' ],
        $value
    );

    // Let the value through if it's nothing but a single comment, to
    // allow other functions which may reject it to pass some error
    // message through.
    if ( preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
        return $value;
    }

    // Remove any comments; IE gets token splitting wrong
    // This must be done AFTER decoding character references and
    // escape sequences, because those steps can introduce comments
    // This step cannot introduce character references or escape
    // sequences, because it replaces comments with spaces rather
    // than removing them completely.
    $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

    // Remove anything after a comment-start token, to guard against
    // incorrect client implementations.
    $danglingComment = strpos( $value, '/*' );
    if ( $danglingComment !== false ) {
        $value = substr( $value, 0, $danglingComment );
    }

    return $value;
}
613 
/**
 * Normalize a CSS value and reject it if it looks dangerous.
 *
 * The value is first passed through normalizeCss(). If the result contains
 * control characters or the UTF-8 replacement character, or matches one of
 * the disallowed constructs below, a CSS comment naming the problem is
 * returned in place of the value.
 *
 * @param string $value
 * @return string The normalized value, '/* invalid control char *' . '/' or
 *   '/* insecure input *' . '/'
 */
public static function checkCss( $value ) {
    $value = self::normalizeCss( $value );

    // Reject problematic keywords and control characters
    if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ||
        strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false
    ) {
        return '/* invalid control char */';
    }

    // Each "\(" must be a plain backslash followed by "(": any character
    // in between leaves the group unbalanced and the pattern fails to
    // compile, which would silently let every value through.
    $insecure = '! expression
        | filter\s*:
        | accelerator\s*:
        | -o-link\s*:
        | -o-link-source\s*:
        | -o-replace\s*:
        | url\s*\(
        | image\s*\(
        | image-set\s*\(
        | attr\s*\([^)]+[\s,]+url
        | var\s*\(
        !ix';
    if ( preg_match( $insecure, $value ) ) {
        return '/* insecure input */';
    }

    return $value;
}
656 
/**
 * preg_replace_callback() handler used by normalizeCss() to decode one CSS
 * backslash escape.
 *
 * @param string[] $matches Groups: 1 = line continuation, 2 = hex character
 *   number, 3 = single escaped character; all empty means a backslash at
 *   the end of the string
 * @return string Replacement text
 */
private static function cssDecodeCallback( $matches ) {
    // Line continuation
    if ( $matches[1] !== '' ) {
        return '';
    }

    if ( $matches[2] !== '' ) {
        $char = UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
    } elseif ( $matches[3] !== '' ) {
        $char = $matches[3];
    } else {
        $char = '\\';
    }

    // These characters need to be escaped in strings
    $mustStayEscaped = in_array( $char, [ "\n", '"', "'", '\\' ] );
    if ( !$mustStayEscaped ) {
        // Decode unnecessary escape
        return $char;
    }

    // Clean up the escape sequence to avoid parsing errors by clients
    return '\\' . dechex( ord( $char ) ) . ' ';
}
681 
/**
 * Turn a tag's raw attribute text into a sanitised attribute string:
 * decode it, drop attributes not allowed on $element, optionally sort by
 * name, and re-encode.
 *
 * @param string $text Raw attribute text
 * @param string $element Tag name
 * @param bool $sorted Whether to sort the attributes by name
 * @return string Empty, or a leading space followed by name="value" pairs
 */
public static function fixTagAttributes( $text, $element, $sorted = false ) {
    if ( trim( $text ) == '' ) {
        return '';
    }

    $attribs = self::validateTagAttributes(
        self::decodeTagAttributes( $text ),
        $element
    );
    if ( $sorted ) {
        ksort( $attribs );
    }

    return self::safeEncodeTagAttributes( $attribs );
}
717 
/**
 * Encode a string for use as an HTML attribute value.
 *
 * @param string $text
 * @return string HTML-escaped text with newline, carriage return and tab
 *   turned into numeric character references
 */
public static function encodeAttribute( $text ) {
    $escaped = htmlspecialchars( $text, ENT_QUOTES );

    // Whitespace is normalized during attribute decoding,
    // so if we've been passed non-spaces we must encode them
    // ahead of time or they won't be preserved.
    return str_replace(
        [ "\n", "\r", "\t" ],
        [ '&#10;', '&#13;', '&#9;' ],
        $escaped
    );
}
737 
/**
 * Replace the plain space used around French punctuation with $space, so
 * the punctuation cannot be separated from its word.
 *
 * Two cases are handled: a space before one of ? : ; ! % » › (only when
 * something precedes the space and no word character follows the
 * punctuation), and a space after « or ‹.
 *
 * @param string $text
 * @param string $space Replacement for the space, inserted literally
 * @return string
 */
public static function armorFrenchSpaces( $text, $space = '&#160;' ) {
    // $space is used below as a preg_replace() replacement string, where
    // "$n" and "\n" are back-references.
    // Replace $ with \$ and \ with \\
    $space = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );
    $fixtags = [
        # French spaces, last one Guillemet-left
        # only if there is something before the space
        # and a non-word character after the punctuation.
        '/(?<=\S) (?=[?:;!%»›](?!\w))/u' => "$space",
        # French spaces, Guillemet-right
        '/([«‹]) /u' => "\\1$space",
    ];
    return preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
}
759 
/**
 * Encode an attribute value for HTML output and additionally armor it
 * against constructs that later parsing passes might expand.
 *
 * @param string $text
 * @return string
 */
public static function safeEncodeAttribute( $text ) {
    # Templates and links may be expanded in later parsing,
    # creating invalid or dangerous output. Suppress this.
    $armor = [
        '<' => '&lt;', // This should never happen,
        '>' => '&gt;', // we've received invalid input
        '"' => '&quot;', // which should have been escaped.
        '{' => '&#123;',
        '}' => '&#125;', // prevent unpaired language conversion syntax
        '[' => '&#91;',
        ']' => '&#93;',
        "''" => '&#39;&#39;',
        'ISBN' => '&#73;SBN',
        'RFC' => '&#82;FC',
        'PMID' => '&#80;MID',
        '|' => '&#124;',
        '__' => '&#95;_',
    ];
    $encoded = strtr( self::encodeAttribute( $text ), $armor );

    # Armor against French spaces detection (T5158)
    $encoded = self::armorFrenchSpaces( $encoded, '&#32;' );

    # Stupid hack
    # Encode the colon of every URL protocol so it is not auto-linked
    $hideColon = function ( $matches ) {
        return str_replace( ':', '&#58;', $matches[1] );
    };

    return preg_replace_callback(
        '/((?i)' . wfUrlProtocols() . ')/',
        $hideColon,
        $encoded
    );
}
799 
/**
 * Escape a string so it can be used as the value of an id attribute,
 * using the escaping mode configured in $wgFragmentMode[$mode].
 *
 * @param string $id
 * @param int $mode self::ID_PRIMARY or self::ID_FALLBACK
 * @return string|bool Escaped id, or false when the requested (non-primary)
 *   mode is not configured
 * @throws UnexpectedValueException When no primary mode is configured
 */
public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
    global $wgFragmentMode;

    if ( isset( $wgFragmentMode[$mode] ) ) {
        return self::escapeIdInternal( $id, $wgFragmentMode[$mode] );
    }

    if ( $mode === self::ID_PRIMARY ) {
        throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
    }

    return false;
}
829 
/**
 * Escape a string so it can be used as the fragment of a link, using the
 * primary escaping mode from $wgFragmentMode.
 *
 * @param string $id
 * @return string
 * @throws UnexpectedValueException When no primary mode is configured
 */
public static function escapeIdForLink( $id ) {
    global $wgFragmentMode;

    if ( !isset( $wgFragmentMode[self::ID_PRIMARY] ) ) {
        throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
    }

    // The mode must be read before it is used below; without this
    // assignment escapeIdInternalUrl() received an undefined variable.
    $mode = $wgFragmentMode[self::ID_PRIMARY];

    $id = self::escapeIdInternalUrl( $id, $mode );

    return $id;
}
855 
// As shown here, this method returns $id unchanged.
// NOTE(review): the embedded listing numbers jump 865 -> 867 -> 869, so
// statements appear to be missing from this extract (presumably the actual
// escaping step, given the method name). Confirm against the upstream file
// before relying on this body.
865  public static function escapeIdForExternalInterwiki( $id ) {
867 
869 
870  return $id;
871  }
872 
/**
 * Escape an id for use inside a URL: same as escapeIdInternal(), except
 * that in 'html5' mode every "%XX" sequence has its percent sign encoded as
 * "%25".
 *
 * @param string $id
 * @param string $mode Escaping mode, as accepted by escapeIdInternal()
 * @return string
 */
private static function escapeIdInternalUrl( $id, $mode ) {
    $escaped = self::escapeIdInternal( $id, $mode );

    return $mode === 'html5'
        ? preg_replace( '/%([a-fA-F0-9]{2})/', '%25$1', $escaped )
        : $escaped;
}
889 
/**
 * Escape an id according to the given mode.
 *
 * 'html5' replaces tab, LF, FF, CR and space with "_". 'legacy' replaces
 * spaces with "_", URL-encodes the result, then turns "%3A" back into ":"
 * and every other "%" into ".".
 *
 * @param string $id
 * @param string $mode 'html5' or 'legacy'
 * @return string
 * @throws InvalidArgumentException For any other mode
 */
private static function escapeIdInternal( $id, $mode ) {
    // Truncate overly-long IDs. This isn't an HTML limit, it's just
    // griefer protection. [T251506]
    $id = mb_substr( $id, 0, 1024 );

    if ( $mode == 'html5' ) {
        // html5 spec says ids must not have any of the following:
        // U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE
        // In practice, in wikitext, only tab, LF, CR (and SPACE) are
        // possible using either Lua or html entities.
        return str_replace( [ "\t", "\n", "\f", "\r", " " ], '_', $id );
    }

    if ( $mode == 'legacy' ) {
        // This corresponds to 'noninitial' mode of the former escapeId()
        $encoded = urlencode( str_replace( ' ', '_', $id ) );

        return strtr( $encoded, [
            '%3A' => ':',
            '%' => '.'
        ] );
    }

    throw new InvalidArgumentException( "Invalid mode '$mode' passed to '" . __METHOD__ );
}
926 
/**
 * Escape each id in a space-separated list of id references.
 *
 * @deprecated since 1.36; emits a deprecation notice on every call
 * @param string $referenceString Space-separated id references
 * @return string
 */
public static function escapeIdReferenceList( $referenceString ) {
    wfDeprecated( __METHOD__, '1.36' );

    $escapedList = self::escapeIdReferenceListInternal( $referenceString );

    return $escapedList;
}
941 
/**
 * Escape each id in a whitespace-separated list of id references, as used
 * by attributes such as aria-labelledby.
 *
 * @param string $referenceString Whitespace-separated id references
 * @return string The escaped ids joined with single spaces; '' when the
 *   input holds no ids
 */
private static function escapeIdReferenceListInternal( $referenceString ) {
    # Split the list into its non-empty tokens
    $tokens = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );

    # Escape each token as an id
    $escaped = [];
    foreach ( $tokens as $position => $token ) {
        $escaped[$position] = self::escapeIdForAttribute( $token );
    }

    # Join back into a space delimited list string
    return implode( ' ', $escaped );
}
964 
/**
 * Turn an arbitrary string into something usable as a CSS class name.
 *
 * A leading digit or hyphen, control characters, spaces, ASCII punctuation
 * and non-breaking spaces become underscores; runs of underscores are
 * collapsed to one and trailing underscores are removed.
 *
 * @param string $class
 * @return string
 */
public static function escapeClass( $class ) {
    // Convert ugly stuff to underscores...
    $underscored = preg_replace(
        '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
        '_',
        $class
    );
    // ...and kill underscores in ugly places
    $collapsed = preg_replace( '/_+/', '_', $underscored );

    return rtrim( $collapsed, '_' );
}
983 
/**
 * HTML-escape a string in which character references are already meant as
 * references: they are decoded first, so they are not double-escaped.
 *
 * @param string $html
 * @return string
 */
public static function escapeHtmlAllowEntities( $html ) {
    $decoded = self::decodeCharReferences( $html );

    # It seems wise to escape ' as well as ", as a matter of course. Can't
    # hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
    # don't cause the entire string to disappear.
    return htmlspecialchars( $decoded, ENT_QUOTES | ENT_SUBSTITUTE );
}
999 
/**
 * Parse a tag's raw attribute text into an attribute map.
 *
 * Names are lower-cased and names with unacceptable characters are
 * skipped. Values have whitespace runs collapsed to one space, are trimmed,
 * and have character references decoded. A later attribute with the same
 * name replaces an earlier one.
 *
 * @param string $text Raw attribute text
 * @return array Attribute name => decoded value
 */
public static function decodeTagAttributes( $text ) {
    if ( trim( $text ) == '' ) {
        return [];
    }

    $sets = [];
    $found = preg_match_all( self::getAttribsRegex(), $text, $sets, PREG_SET_ORDER );
    if ( !$found ) {
        return [];
    }

    $nameRegex = self::getAttribNameRegex();
    $attribs = [];
    foreach ( $sets as $set ) {
        $name = strtolower( $set[1] );

        // Keep only attribute names made of acceptable characters
        if ( preg_match( $nameRegex, $name ) ) {
            $raw = self::getTagAttributeCallback( $set );

            // Normalize whitespace
            $collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

            // Decode character references
            $attribs[$name] = self::decodeCharReferences( $collapsed );
        }
    }

    return $attribs;
}
1042 
/**
 * Build an HTML attribute string from an attribute map, encoding names and
 * values for safe output.
 *
 * @param array $assoc_array Attribute name => value
 * @return string '' for an empty map, otherwise a leading space followed by
 *   space-separated name="value" pairs
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
    $pairs = [];
    foreach ( $assoc_array as $name => $rawValue ) {
        $pairs[] = htmlspecialchars( $name ) . '="' . self::safeEncodeAttribute( $rawValue ) . '"';
    }

    if ( !$pairs ) {
        return '';
    }

    return ' ' . implode( ' ', $pairs );
}
1060 
/**
 * Pick the attribute value out of one match set produced by the
 * getAttribsRegex() pattern.
 *
 * @param array $set Match set: 2 = "= value" part, 3 = double-quoted value,
 *   4 = single-quoted value, 5 = unquoted value
 * @return string The value, or '' for an attribute given without a value
 * @throws MWException When the set has a value part but no value group
 */
private static function getTagAttributeCallback( $set ) {
    # Check the unquoted (5), single-quoted (4) and double-quoted (3) groups
    # in that order; the highest one present holds the value.
    foreach ( [ 5, 4, 3 ] as $group ) {
        if ( isset( $set[$group] ) ) {
            return $set[$group];
        }
    }

    if ( !isset( $set[2] ) ) {
        # In XHTML, attributes must have a value so return an empty string.
        # See "Empty attribute syntax",
        # https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
        return "";
    }

    throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
}
1088 
/**
 * Collapse every run of spaces, tabs, CRs and LFs into a single space and
 * trim the result.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
    $collapsed = preg_replace( '/(?:\r\n|[\x20\x0d\x0a\x09])+/', ' ', $text );

    return trim( $collapsed );
}
1099 
/**
 * Normalize a section name: every run of spaces and/or underscores becomes
 * one space, and the result is trimmed.
 *
 * @param string $section
 * @return string
 */
public static function normalizeSectionNameWhitespace( $section ) {
    $spaced = preg_replace( '/[ _]+/', ' ', $section );

    return trim( $spaced );
}
1111 
/**
 * Normalize every "&" sequence in $text: recognised references are put
 * into a canonical form and anything else is escaped, as decided by
 * normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function normalizeCharReferences( $text ) {
    $normalizeOne = static function ( $matches ) {
        return self::normalizeCharReferencesCallback( $matches );
    };

    return preg_replace_callback( self::CHAR_REFS_REGEX, $normalizeOne, $text );
}
1133 
/**
 * preg_replace_callback() handler for normalizeCharReferences().
 *
 * @param string[] $matches CHAR_REFS_REGEX groups: 1 = named entity,
 *   2 = decimal reference, 3 = hexadecimal reference
 * @return string The normalized reference, or the HTML-escaped original
 *   text when the match is a bare "&" or an invalid numeric reference
 */
private static function normalizeCharReferencesCallback( $matches ) {
    $normalized = null;

    if ( $matches[1] != '' ) {
        $normalized = self::normalizeEntity( $matches[1] );
    } elseif ( $matches[2] != '' ) {
        $normalized = self::decCharReference( $matches[2] );
    } elseif ( $matches[3] != '' ) {
        $normalized = self::hexCharReference( $matches[3] );
    }

    return $normalized ?? htmlspecialchars( $matches[0] );
}
1153 
/**
 * Normalize a named entity.
 *
 * MediaWiki-specific aliases are replaced by the entity they stand for;
 * lt, gt, amp and quot are kept by name; any other known entity becomes
 * decimal numeric references; an unknown name has its ampersand escaped.
 *
 * @param string $name Entity name including the trailing ";"
 * @return string
 */
private static function normalizeEntity( $name ) {
    // Non-standard MediaWiki-specific entities
    if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
        return '&' . self::MW_ENTITY_ALIASES[$name];
    }

    // Keep these in word form
    if ( in_array( $name, [ 'lt;', 'gt;', 'amp;', 'quot;' ], true ) ) {
        return "&$name";
    }

    if ( !isset( HTMLData::$namedEntityTranslations[$name] ) ) {
        return "&amp;$name";
    }

    // Beware: some entities expand to more than 1 codepoint
    $toNumericRef = function ( $m ) {
        return '&#' . UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';';
    };

    return preg_replace_callback(
        '/./Ssu',
        $toNumericRef,
        HTMLData::$namedEntityTranslations[$name]
    );
}
1180 
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint The decimal digits of the reference
 * @return string|null Canonical "&#N;" form, or null when the code point
 *   is not allowed
 */
private static function decCharReference( $codepoint ) {
    $point = intval( $codepoint );

    return self::validateCodepoint( $point )
        ? sprintf( '&#%d;', $point )
        : null;
}
1193 
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint The hexadecimal digits of the reference
 * @return string|null Canonical lower-case "&#xN;" form, or null when the
 *   code point is not allowed
 */
private static function hexCharReference( $codepoint ) {
    $point = hexdec( $codepoint );

    return self::validateCodepoint( $point )
        ? sprintf( '&#x%x;', $point )
        : null;
}
1206 
/**
 * Tell whether a code point may appear as a character reference.
 *
 * Allowed: tab, line feed, U+0020-U+007E, U+00A0-U+D7FF, U+E000-U+FFFD and
 * U+10000-U+10FFFF.
 *
 * @param int|float $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
    # U+000C is valid in HTML5 but not allowed in XML.
    # U+000D is valid in XML but not allowed in HTML5.
    # U+007F - U+009F are disallowed in HTML5 (control characters).
    if ( $codepoint == 0x09 || $codepoint == 0x0a ) {
        return true;
    }

    $allowedRanges = [
        [ 0x20, 0x7e ],
        [ 0xa0, 0xd7ff ],
        [ 0xe000, 0xfffd ],
        [ 0x10000, 0x10ffff ],
    ];
    foreach ( $allowedRanges as [ $low, $high ] ) {
        if ( $codepoint >= $low && $codepoint <= $high ) {
            return true;
        }
    }

    return false;
}
1224 
/**
 * Decode every character reference in $text to the character it stands
 * for, as decided by decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
    $decodeOne = static function ( $matches ) {
        return self::decodeCharReferencesCallback( $matches );
    };

    return preg_replace_callback( self::CHAR_REFS_REGEX, $decodeOne, $text );
}
1238 
/**
 * Decode every character reference in $text and, if anything matched, run
 * the result through the content language's normalize().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
    $decodeOne = static function ( $matches ) {
        return self::decodeCharReferencesCallback( $matches );
    };

    $replaced = 0;
    $text = preg_replace_callback(
        self::CHAR_REFS_REGEX,
        $decodeOne,
        $text,
        -1, // limit
        $replaced
    );

    // Nothing matched, so there is nothing new to normalize.
    if ( !$replaced ) {
        return $text;
    }

    return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $text );
}
1264 
/**
 * preg_replace_callback() handler for the decodeCharReferences*() methods.
 *
 * @param string[] $matches CHAR_REFS_REGEX groups: 1 = named entity,
 *   2 = decimal reference, 3 = hexadecimal reference
 * @return string The decoded text, or the match itself for a bare "&"
 */
private static function decodeCharReferencesCallback( $matches ) {
    if ( $matches[1] != '' ) {
        return self::decodeEntity( $matches[1] );
    }
    if ( $matches[2] != '' ) {
        return self::decodeChar( intval( $matches[2] ) );
    }
    if ( $matches[3] != '' ) {
        return self::decodeChar( hexdec( $matches[3] ) );
    }

    # Last case should be an ampersand by itself
    return $matches[0];
}
1280 
/**
 * Return the UTF-8 encoding of a code point, or the UTF-8 replacement
 * character when validateCodepoint() rejects it.
 *
 * @param int|float $codepoint
 * @return string
 */
private static function decodeChar( $codepoint ) {
    return self::validateCodepoint( $codepoint )
        ? UtfNormal\Utils::codepointToUtf8( $codepoint )
        : UtfNormal\Constants::UTF8_REPLACEMENT;
}
1295 
/**
 * Decode a named entity to its translation from
 * HTMLData::$namedEntityTranslations.
 *
 * @param string $name Entity name including the trailing ";"
 * @return string The translation, or "&" followed by the name when the
 *   entity is unknown. MediaWiki-specific aliases are resolved first, so an
 *   unknown alias comes back under its resolved name.
 */
private static function decodeEntity( $name ) {
    // These are MediaWiki-specific entities, not in the HTML standard
    $name = self::MW_ENTITY_ALIASES[$name] ?? $name;

    return HTMLData::$namedEntityTranslations[$name] ?? "&$name";
}
1312 
/**
 * Return the attributes allowed on the given element.
 *
 * @param string $element Tag name
 * @return array Map with the allowed attribute names as keys; empty when
 *   the element has no entry
 */
private static function attributesAllowedInternal( $element ) {
    // The per-element table is built and cached by
    // setupAttributesAllowedInternal(); without this assignment $list was
    // undefined and every element came back with no allowed attributes.
    $list = self::setupAttributesAllowedInternal();

    return $list[$element] ?? [];
}
1324 
/**
 * Build (on first use) and return the table that maps each allowed HTML
 * element to the attributes allowed on it.
 *
 * The table is kept in a function-local static variable, so it is only
 * constructed on the first call; later calls return the stored table.
 *
 * @return array[] Map of element name => array whose keys are the
 *   attribute names allowed on that element (the values are the
 *   positions the names had in the lists they were flipped from)
 */
private static function setupAttributesAllowedInternal() {
	static $attributeTable;

	if ( $attributeTable !== null ) {
		// Already built by an earlier call
		return $attributeTable;
	}

	// Lookups are done by attribute name, so every plain list of names
	// is flipped to turn the names into array keys before merging.
	$withAttribs = static function ( $base, $names, $moreNames = [] ) {
		$flippedNames = array_flip( $names );
		$flippedMore = array_flip( $moreNames );
		return array_merge( $base, $flippedNames, $flippedMore );
	};

	// Attributes accepted on (almost) every element below
	$universal = $withAttribs( [], [
		// HTML
		'id',
		'class',
		'style',
		'lang',
		'dir',
		'title',
		'tabindex',

		// WAI-ARIA
		'aria-describedby',
		'aria-flowto',
		'aria-hidden',
		'aria-label',
		'aria-labelledby',
		'aria-owns',
		'role',

		// RDFa
		// These attributes are specified in section 9 of
		// https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		'about',
		'property',
		'resource',
		'datatype',
		'typeof',

		// Microdata. These are specified by
		// https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
		'itemid',
		'itemprop',
		'itemref',
		'itemscope',
		'itemtype',
	] );

	// Universal attributes plus 'align'
	$blockLevel = $withAttribs( $universal, [ 'align' ] );

	// Plain (unflipped) name lists shared by the table row/cell entries
	$cellAlignment = [ 'align', 'valign' ];
	$cellAttribs = [
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', // deprecated
		'width', // deprecated
		'height', // deprecated
		'bgcolor', // deprecated
	];

	// Numbers refer to sections in HTML 4.01 standard describing the element.
	// See: https://www.w3.org/TR/html4/
	$attributeTable = [
		// 7.5.4
		'div' => $blockLevel,
		'center' => $universal, // deprecated
		'span' => $universal,

		// 7.5.5
		'h1' => $blockLevel,
		'h2' => $blockLevel,
		'h3' => $blockLevel,
		'h4' => $blockLevel,
		'h5' => $blockLevel,
		'h6' => $blockLevel,

		// 7.5.6
		// (address: no entry)

		// 8.2.4
		'bdo' => $universal,

		// 9.2.1
		'em' => $universal,
		'strong' => $universal,
		'cite' => $universal,
		'dfn' => $universal,
		'code' => $universal,
		'samp' => $universal,
		'kbd' => $universal,
		'var' => $universal,
		'abbr' => $universal,
		// (acronym: no entry)

		// 9.2.2
		'blockquote' => $withAttribs( $universal, [ 'cite' ] ),
		'q' => $withAttribs( $universal, [ 'cite' ] ),

		// 9.2.3
		'sub' => $universal,
		'sup' => $universal,

		// 9.3.1
		'p' => $blockLevel,

		// 9.3.2
		'br' => $withAttribs( $universal, [ 'clear' ] ),

		// https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
		'wbr' => $universal,

		// 9.3.4
		'pre' => $withAttribs( $universal, [ 'width' ] ),

		// 9.4
		'ins' => $withAttribs( $universal, [ 'cite', 'datetime' ] ),
		'del' => $withAttribs( $universal, [ 'cite', 'datetime' ] ),

		// 10.2
		'ul' => $withAttribs( $universal, [ 'type' ] ),
		'ol' => $withAttribs( $universal, [ 'type', 'start', 'reversed' ] ),
		'li' => $withAttribs( $universal, [ 'type', 'value' ] ),

		// 10.3
		'dl' => $universal,
		'dd' => $universal,
		'dt' => $universal,

		// 11.2.1
		'table' => $withAttribs( $universal, [
			'summary',
			'width',
			'border',
			'frame',
			'rules',
			'cellspacing',
			'cellpadding',
			'align',
			'bgcolor',
		] ),

		// 11.2.2
		'caption' => $blockLevel,

		// 11.2.3
		'thead' => $universal,
		'tfoot' => $universal,
		'tbody' => $universal,

		// 11.2.4
		'colgroup' => $withAttribs( $universal, [ 'span' ] ),
		'col' => $withAttribs( $universal, [ 'span' ] ),

		// 11.2.5
		'tr' => $withAttribs( $universal, [ 'bgcolor' ], $cellAlignment ),

		// 11.2.6
		'td' => $withAttribs( $universal, $cellAttribs, $cellAlignment ),
		'th' => $withAttribs( $universal, $cellAttribs, $cellAlignment ),

		// 12.2
		// NOTE: <a> is not allowed directly, but this list of allowed
		// attributes is used from the Parser object
		'a' => $withAttribs( $universal, [ 'href', 'rel', 'rev' ] ), // rel/rev esp. for RDFa

		// 13.2
		// Not usually allowed, but may be used for extension-style hooks
		// such as <math> when it is rasterized, or if $wgAllowImageTag is
		// true
		'img' => $withAttribs( $universal, [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
		// Attributes for A/V tags added in T163583 / T133673
		'audio' => $withAttribs( $universal, [ 'controls', 'preload', 'width', 'height' ] ),
		'video' => $withAttribs( $universal, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
		'source' => $withAttribs( $universal, [ 'type', 'src' ] ),
		'track' => $withAttribs( $universal, [ 'type', 'src', 'srclang', 'kind', 'label' ] ),

		// 15.2.1
		'tt' => $universal,
		'b' => $universal,
		'i' => $universal,
		'big' => $universal,
		'small' => $universal,
		'strike' => $universal,
		's' => $universal,
		'u' => $universal,

		// 15.2.2
		'font' => $withAttribs( $universal, [ 'size', 'color', 'face' ] ),
		// (basefont: no entry)

		// 15.3
		'hr' => $withAttribs( $universal, [ 'width' ] ),

		// HTML Ruby annotation text module, simple ruby only.
		// https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
		'ruby' => $universal,
		// (rbc: no entry)
		'rb' => $universal,
		'rp' => $universal,
		'rt' => $universal, // disabled variant: universal attributes plus 'rbspan'
		'rtc' => $universal,

		// MathML root element, where used for extensions
		// 'title' may not be 100% valid here; it's XHTML
		// https://www.w3.org/TR/REC-MathML/
		'math' => $withAttribs( [], [ 'class', 'style', 'id', 'title' ] ),

		// HTML 5 section 4.5
		'figure' => $universal,
		'figcaption' => $universal,

		// HTML 5 section 4.6
		'bdi' => $universal,

		// HTML5 elements, defined by:
		// https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
		'data' => $withAttribs( $universal, [ 'value' ] ),
		'time' => $withAttribs( $universal, [ 'datetime' ] ),
		'mark' => $universal,

		// meta and link are only permitted by removeHTMLtags when Microdata
		// is enabled so we don't bother adding a conditional to hide these
		// Also meta and link are only valid in WikiText as Microdata elements
		// (ie: validateTag rejects tags missing the attributes needed for Microdata)
		// So we don't bother including the universal attributes that have no purpose.
		'meta' => $withAttribs( [], [ 'itemprop', 'content' ] ),
		'link' => $withAttribs( [], [ 'itemprop', 'href', 'title' ] ),
	];

	return $attributeTable;
}
1562 
/**
 * Take a fragment of (potentially invalid) HTML and return its text
 * with any tags removed.
 *
 * The input is tokenized with RemexHtml; the text is collected by a
 * RemexStripTagHandler and then passed through normalizeWhitespace().
 *
 * @param string $html HTML fragment
 * @return string Result of normalizeWhitespace() on the extracted text
 */
public static function stripAllTags( $html ) {
	// Let RemexHtml tokenize $html; the handler accumulates the text
	$textCollector = new RemexStripTagHandler();
	$tokenizerOptions = [
		'ignoreErrors' => true,
		// char refs are not ignored: we want them to be decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	];
	( new RemexHtml\Tokenizer\Tokenizer( $textCollector, $html, $tokenizerOptions ) )->execute();

	return self::normalizeWhitespace( $textCollector->getResult() );
}
1589 
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations.
 *
 * One <!ENTITY> declaration is emitted for every key of
 * HTMLData::$namedEntityTranslations that ends in a semicolon and for
 * which normalizeEntity() returns something other than the key itself.
 *
 * @return string The DOCTYPE declaration, terminated by a newline
 */
public static function hackDocType() {
	$doctype = "<!DOCTYPE html [\n";
	foreach ( array_keys( HTMLData::$namedEntityTranslations ) as $entity ) {
		// Some HTML entities omit the trailing semicolon;
		// wikitext does not permit these, so only the others are declared.
		if ( substr( $entity, -1 ) === ';' ) {
			$expansion = self::normalizeEntity( $entity );
			// Entities that come back unchanged (&lt; &gt; etc) are skipped
			if ( $expansion !== $entity ) {
				$bareName = substr( $entity, 0, -1 );
				$doctype .= '<!ENTITY ' . $bareName . ' "' . $expansion . '">';
			}
		}
	}
	return $doctype . "]>\n";
}
1620 
/**
 * Clean up a URL: decode character references, percent-encode unsafe
 * characters, and normalize the host portion.
 *
 * In the host portion, characters that are ignored in IDNs are stripped
 * and a percent-encoded bracketed IPv6 literal is decoded back to
 * "[...]" form. Input that does not look like "protocol:rest" is
 * returned after the first two steps only.
 *
 * @param string $url
 * @return string
 */
public static function cleanUrl( $url ) {
	// Normalize any HTML entities in input. They will be
	// re-escaped by makeExternalLink().
	$url = self::decodeCharReferences( $url );

	// Escape any control characters introduced by the above step
	$url = preg_replace_callback(
		'/[\][<>"\\x00-\\x20\\x7F\|]/',
		[ __CLASS__, 'cleanUrlCallback' ],
		$url
	);

	// Split into protocol, optional //host and the remainder; anything
	// that does not have that shape is returned as it is now.
	$parts = [];
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $parts ) ) {
		return $url;
	}
	[ , $protocol, $host, $rest ] = $parts;

	// Characters that will be ignored in IDNs.
	// https://tools.ietf.org/html/rfc3454#section-3.1
	// Strip them before further processing so deny lists and such work.
	$ignoredInIdn = "/
		\\s| # general whitespace
		\xc2\xad| # 00ad SOFT HYPHEN
		\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
		\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
		\xe2\x81\xa0| # 2060 WORD JOINER
		\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
		\xcd\x8f| # 034f COMBINING GRAPHEME JOINER
		\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
		\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
		\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
		\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
		\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
		[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
		/xuD";

	// NOTE(review): preg_replace() returns null on error (for example
	// malformed UTF-8 under the "u" modifier), in which case the host
	// would be dropped from the result - confirm callers only pass
	// valid UTF-8.
	$host = preg_replace( $ignoredInIdn, '', $host );

	// IPv6 host names are bracketed with []. Url-decode these.
	$ipv6 = [];
	if ( substr_compare( "//%5B", $host, 0, 5 ) === 0 &&
		preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $ipv6 )
	) {
		$host = '//[' . $ipv6[1] . ']' . $ipv6[2];
	}

	// @todo FIXME: Validate hostnames here

	return $protocol . $host . $rest;
}
1674 
/**
 * Callback used by cleanUrl() with preg_replace_callback(): URL-encode
 * the matched text.
 *
 * @param array $matches Regex match array; only element 0 is used
 * @return string urlencode()d form of the whole match
 */
private static function cleanUrlCallback( $matches ) {
	$matchedText = $matches[0];
	return urlencode( $matchedText );
}
1682 
/**
 * Does a string look like an e-mail address?
 *
 * The IsValidEmailAddr hook is consulted first: when the hook runner
 * returns false, whatever the hook stored in $result is returned as is.
 * Otherwise the address is matched against a pattern of the form
 * "user part" @ "domain labels separated by dots".
 *
 * @param string $addr E-mail address
 * @return bool|null Result of the pattern match, or the value set by
 *   the hook (null if the hook aborted without setting one)
 */
public static function validateEmail( $addr ) {
	$result = null;
	// TODO This method should be non-static, and have a HookRunner injected
	if ( !Hooks::runner()->onIsValidEmailAddr( $addr, $result ) ) {
		return $result;
	}

	// The strings below are interpolated inside brackets [] in the
	// pattern, where a bare hyphen "-" would act as a range indicator.
	// Hence it is double backslashed below. See T28948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	// The D modifier makes "$" match only at the very end of the
	// subject; without it an address followed by a trailing newline
	// would be accepted as valid.
	$html5_email_regexp = "/
		^                          # start of string
		[$rfc5322_atext\\.]+       # user part which is liberal :p
		@                          # at sign
		[$rfc1034_ldh_str]+        # First domain part
		(\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
		$                          # End of string
		/ixD"; // case Insensitive, eXtended, Dollar end only

	return (bool)preg_match( $html5_email_regexp, $addr );
}
1735 }
Sanitizer\ID_FALLBACK
const ID_FALLBACK
Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false if no fallback...
Definition: Sanitizer.php:78
Sanitizer\normalizeEntity
static normalizeEntity( $name)
If the named entity is defined in HTML5 return the equivalent numeric entity reference (except for th...
Definition: Sanitizer.php:1164
Sanitizer\getTagAttributeCallback
static getTagAttributeCallback( $set)
Pick the appropriate attribute value from a match set from the attribs regex matches.
Definition: Sanitizer.php:1069
Sanitizer\removeHTMLcomments
static removeHTMLcomments( $text)
Remove '<!--', '-->', and everything between.
Definition: Sanitizer.php:308
Sanitizer\stripAllTags
static stripAllTags( $html)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed,...
Definition: Sanitizer.php:1574
Sanitizer\EVIL_URI_PATTERN
const EVIL_URI_PATTERN
Pattern matching evil uris like javascript: WARNING: DO NOT use this in any place that actually requi...
Definition: Sanitizer.php:62
MediaWiki\MediaWikiServices
MediaWikiServices is the service locator for the application scope of MediaWiki.
Definition: MediaWikiServices.php:165
$wgExternalInterwikiFragmentMode
$wgExternalInterwikiFragmentMode
Which ID escaping mode should be used for external interwiki links? See documentation for $wgFragment...
Definition: DefaultSettings.php:3602
Sanitizer\escapeIdForAttribute
static escapeIdForAttribute( $id, $mode=self::ID_PRIMARY)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid HTM...
Definition: Sanitizer.php:815
Sanitizer\removeHTMLtags
static removeHTMLtags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments.
Definition: Sanitizer.php:239
Sanitizer\decodeEntity
static decodeEntity( $name)
If the named entity is defined in HTML5 return the UTF-8 encoding of that character.
Definition: Sanitizer.php:1304
Sanitizer\mergeAttributes
static mergeAttributes( $a, $b)
Merge two sets of HTML attributes.
Definition: Sanitizer.php:541
Sanitizer\validateAttributes
static validateAttributes( $attribs, $allowed)
Take an array of attribute names and values and normalize or discard illegal values.
Definition: Sanitizer.php:413
Sanitizer\normalizeSectionNameWhitespace
static normalizeSectionNameWhitespace( $section)
Normalizes whitespace in a section name, such as might be returned by Parser::stripSectionName(),...
Definition: Sanitizer.php:1108
Sanitizer\escapeIdInternal
static escapeIdInternal( $id, $mode)
Helper for escapeIdFor*() functions.
Definition: Sanitizer.php:897
Sanitizer\validateEmail
static validateEmail( $addr)
Does a string look like an e-mail address?
Definition: Sanitizer.php:1711
Sanitizer\attributesAllowedInternal
static attributesAllowedInternal( $element)
Fetch the list of acceptable attributes for a given element name.
Definition: Sanitizer.php:1320
Sanitizer\decCharReference
static decCharReference( $codepoint)
Definition: Sanitizer.php:1185
Sanitizer\safeEncodeTagAttributes
static safeEncodeTagAttributes( $assoc_array)
Build a partial tag string from an associative array of attribute names and values as returned by dec...
Definition: Sanitizer.php:1050
Sanitizer\decodeCharReferencesAndNormalize
static decodeCharReferencesAndNormalize( $text)
Decode any character references, numeric or named entities, in the text and normalize the resulting s...
Definition: Sanitizer.php:1249
Sanitizer\$attribsRegex
static $attribsRegex
Lazy-initialised attributes regex, see getAttribsRegex()
Definition: Sanitizer.php:92
$wgFragmentMode
$wgFragmentMode
How should section IDs be encoded? This array can contain 1 or 2 elements, each of them can be one of...
Definition: DefaultSettings.php:3592
Sanitizer\escapeClass
static escapeClass( $class)
Given a value, escape it so that it can be used as a CSS class and return it.
Definition: Sanitizer.php:976
Sanitizer\normalizeCharReferencesCallback
static normalizeCharReferencesCallback( $matches)
Definition: Sanitizer.php:1138
Sanitizer\armorFrenchSpaces
static armorFrenchSpaces( $text, $space='&#160;')
Armor French spaces with a replacement character.
Definition: Sanitizer.php:746
RemexStripTagHandler
Definition: RemexStripTagHandler.php:9
Sanitizer\validateTag
static validateTag( $params, $element)
Takes attribute names and values for a tag and the tag name and validates that the tag is allowed to ...
Definition: Sanitizer.php:354
Sanitizer\$attribNameRegex
static $attribNameRegex
Lazy-initialised attribute name regex, see getAttribNameRegex()
Definition: Sanitizer.php:123
Sanitizer\hackDocType
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
Definition: Sanitizer.php:1601
Sanitizer\XMLNS_ATTRIBUTE_PATTERN
const XMLNS_ATTRIBUTE_PATTERN
Definition: Sanitizer.php:63
$wgAllowImageTag
$wgAllowImageTag
A different approach to the above: simply allow the "<img>" tag to be used.
Definition: DefaultSettings.php:4648
wfDeprecatedMsg
wfDeprecatedMsg( $msg, $version=false, $component=false, $callerOffset=2)
Log a deprecation warning with arbitrary message text.
Definition: GlobalFunctions.php:1062
MWException
MediaWiki exception.
Definition: MWException.php:29
wfDeprecated
wfDeprecated( $function, $version=false, $component=false, $callerOffset=2)
Logs a warning that $function is deprecated.
Definition: GlobalFunctions.php:1030
Sanitizer\safeEncodeAttribute
static safeEncodeAttribute( $text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing.
Definition: Sanitizer.php:766
$matches
$matches
Definition: NoLocalSettings.php:24
Sanitizer\encodeAttribute
static encodeAttribute( $text)
Encode an attribute value for HTML output.
Definition: Sanitizer.php:723
$args
if( $line===false) $args
Definition: mcc.php:124
Sanitizer\escapeIdReferenceListInternal
static escapeIdReferenceListInternal( $referenceString)
Given a string containing a space delimited list of ids, escape each id to match ids escaped by the e...
Definition: Sanitizer.php:949
Sanitizer\hexCharReference
static hexCharReference( $codepoint)
Definition: Sanitizer.php:1198
Sanitizer\validateCodepoint
static validateCodepoint( $codepoint)
Returns true if a given Unicode codepoint is a valid character in both HTML5 and XML.
Definition: Sanitizer.php:1213
wfUrlProtocols
wfUrlProtocols( $includeProtocolRelative=true)
Returns a regular expression of url protocols.
Definition: GlobalFunctions.php:721
Sanitizer\MW_ENTITY_ALIASES
const MW_ENTITY_ALIASES
Character entity aliases accepted by MediaWiki in wikitext.
Definition: Sanitizer.php:84
Sanitizer\validateTagAttributes
static validateTagAttributes( $attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
Definition: Sanitizer.php:390
Sanitizer\ELEMENT_BITS_REGEX
const ELEMENT_BITS_REGEX
Acceptable tag name charset from HTML5 parsing spec https://www.w3.org/TR/html5/syntax....
Definition: Sanitizer.php:51
Hooks\runner
static runner()
Get a HookRunner instance for calling hooks using the new interfaces.
Definition: Hooks.php:172
Sanitizer\escapeIdForExternalInterwiki
static escapeIdForExternalInterwiki( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
Definition: Sanitizer.php:865
Sanitizer\isReservedDataAttribute
static isReservedDataAttribute( $attr)
Given an attribute name, checks whether it is a reserved data attribute (such as data-mw-foo) which i...
Definition: Sanitizer.php:520
Sanitizer\cleanUrl
static cleanUrl( $url)
Definition: Sanitizer.php:1625
Sanitizer\setupAttributesAllowedInternal
static setupAttributesAllowedInternal()
For each array key (an allowed HTML element), return an array of allowed attributes.
Definition: Sanitizer.php:1332
Sanitizer\cssDecodeCallback
static cssDecodeCallback( $matches)
Definition: Sanitizer.php:661
Sanitizer\getRecognizedTagData
static getRecognizedTagData( $extratags=[], $removetags=[])
Return the various lists of recognized tags.
Definition: Sanitizer.php:144
Sanitizer\fixTagAttributes
static fixTagAttributes( $text, $element, $sorted=false)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML,...
Definition: Sanitizer.php:703
Sanitizer\escapeIdInternalUrl
static escapeIdInternalUrl( $id, $mode)
Do percent encoding of percent signs for href (but not id) attributes.
Definition: Sanitizer.php:882
Sanitizer\normalizeCss
static normalizeCss( $value)
Normalize CSS into a format we can easily search for hostile input.
Definition: Sanitizer.php:562
Sanitizer\ID_PRIMARY
const ID_PRIMARY
Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding.
Definition: Sanitizer.php:70
Sanitizer\CHAR_REFS_REGEX
const CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferenc...
Definition: Sanitizer.php:41
Sanitizer\normalizeWhitespace
static normalizeWhitespace( $text)
Definition: Sanitizer.php:1093
Sanitizer\decodeChar
static decodeChar( $codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEM...
Definition: Sanitizer.php:1288
Sanitizer\getAttribNameRegex
static getAttribNameRegex()
Used in Sanitizer::decodeTagAttributes to filter attributes.
Definition: Sanitizer.php:129
StringUtils\delimiterReplace
static delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags='')
Perform an operation equivalent to preg_replace() with flags.
Definition: StringUtils.php:248
Sanitizer\normalizeCharReferences
static normalizeCharReferences( $text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
Definition: Sanitizer.php:1127
Sanitizer\escapeIdForLink
static escapeIdForLink( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
Definition: Sanitizer.php:842
Sanitizer\decodeTagAttributes
static decodeTagAttributes( $text)
Return an associative array of attribute names and values from a partial tag string.
Definition: Sanitizer.php:1008
$t
$t
Definition: testCompression.php:74
Sanitizer\decodeCharReferences
static decodeCharReferences( $text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
Definition: Sanitizer.php:1232
Sanitizer
HTML sanitizer for MediaWiki.
Definition: Sanitizer.php:34
Sanitizer\checkCss
static checkCss( $value)
Pick apart some CSS and check it for forbidden or unsafe structures.
Definition: Sanitizer.php:632
Sanitizer\escapeIdReferenceList
static escapeIdReferenceList( $referenceString)
Given a string containing a space delimited list of ids, escape each id to match ids escaped by the e...
Definition: Sanitizer.php:937
Sanitizer\getAttribsRegex
static getAttribsRegex()
Regular expression to match HTML/XML attribute pairs within a tag.
Definition: Sanitizer.php:100
Sanitizer\escapeHtmlAllowEntities
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Definition: Sanitizer.php:991
Sanitizer\decodeCharReferencesCallback
static decodeCharReferencesCallback( $matches)
Definition: Sanitizer.php:1269
Sanitizer\cleanUrlCallback
static cleanUrlCallback( $matches)
Definition: Sanitizer.php:1679