MediaWiki  master
Sanitizer.php
Go to the documentation of this file.
1 <?php
28 use RemexHtml\HTMLData;
29 
34 class Sanitizer {
/**
 * Matches the character references handled by this class. Capture groups,
 * as consumed by the *CharReferencesCallback() methods:
 *   1 = named entity including its trailing ";",
 *   2 = decimal code point, 3 = hexadecimal code point,
 *   4 = a bare "&" that starts no recognised reference.
 */
41  private const CHAR_REFS_REGEX =
42  '/&([A-Za-z0-9\x80-\xff]+;)
43  |&\#([0-9]+);
44  |&\#[xX]([0-9A-Fa-f]+);
45  |(&)/x';
46 
/**
 * Splits one "<"-delimited chunk of markup, as used by removeHTMLtags():
 *   1 = optional leading "/", 2 = tag name, 3 = attribute text,
 *   4 = closing ">" or "/>", 5 = text following the tag.
 */
51  private const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
52 
/** Pattern used by validateAttributes() to reject javascript:/vbscript: style values. */
62  private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
/** Pattern identifying XML namespace declaration attributes (xmlns:*). */
63  private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
64 
/** Index into $wgFragmentMode selecting the primary ID escaping mode. */
70  public const ID_PRIMARY = 0;
71 
/** Index into $wgFragmentMode selecting the fallback ID escaping mode. */
78  public const ID_FALLBACK = 1;
79 
/** Non-standard, MediaWiki-specific entity names mapped to the standard entity they alias. */
84  private const MW_ENTITY_ALIASES = [
85  'רלמ;' => 'rlm;',
86  'رلم;' => 'rlm;',
87  ];
88 
/**
 * Cached pattern built by getAttribsRegex(); null until first use.
 * @var string|null
 */
private static $attribsRegex;

/**
 * Build (once) and return the pattern that splits a tag's attribute text
 * into individual attributes.
 *
 * Capture groups: 1 = attribute name, 2 = "=" together with the value
 * (optional), 3 = double-quoted value, 4 = single-quoted value,
 * 5 = unquoted value.
 *
 * @return string PCRE pattern including delimiters and flags
 */
private static function getAttribsRegex() {
	if ( self::$attribsRegex !== null ) {
		return self::$attribsRegex;
	}

	$wsChars = '\x09\x0a\x0c\x0d\x20';
	$ws = "[{$wsChars}]";
	$nameChar = "[^{$wsChars}\/>=]";
	// A name may start with "=", but may not contain one afterwards
	$nameStart = "(?:{$nameChar}|=)";

	self::$attribsRegex =
		"/({$nameStart}{$nameChar}*)
			($ws*=$ws*
			(?:
				# The attribute value: quoted or alone
				\"([^\"]*)(?:\"|\$)
				| '([^']*)(?:'|\$)
				| (((?!$ws|>).)*)
			)
		)?/sxu";

	return self::$attribsRegex;
}
119 
/**
 * Cached pattern built by getAttribNameRegex(); null until first use.
 * @var string|null
 */
private static $attribNameRegex;

/**
 * Build (once) and return the pattern an attribute name must match in
 * full to be accepted by decodeTagAttributes().
 *
 * @return string PCRE pattern including delimiters and flags
 */
private static function getAttribNameRegex() {
	if ( self::$attribNameRegex !== null ) {
		return self::$attribNameRegex;
	}

	// First character: colon, underscore, letter or number;
	// later characters may additionally be "." or "-".
	$first = "[:_\p{L}\p{N}]";
	$later = "[:_\.\-\p{L}\p{N}]";
	self::$attribNameRegex = "/^({$first}{$later}*)$/sxu";

	return self::$attribNameRegex;
}
137 
/**
 * Return the lookup tables describing which HTML tags are recognised and
 * how each one may be used.
 *
 * The static part of the tables depends only on $wgAllowImageTag and is
 * cached between calls; it is rebuilt when that setting changes. The cache
 * is keyed on the boolean value of the setting, so it is also effective
 * when the setting is off (the previous code stored the falsy flag itself
 * as its "initialised" marker and therefore rebuilt on every call).
 *
 * @param string[] $extratags Additional tag names to treat as recognised paired tags
 * @param string[] $removetags Tag names to drop from the recognised set
 * @return array[] Map with keys 'htmlpairs', 'htmlsingle', 'htmlsingleonly',
 *  'htmlnest', 'tabletags', 'htmllist', 'listtags', 'htmlsingleallowed'
 *  and 'htmlelements'; each value is an array keyed by tag name.
 */
public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
	global $wgAllowImageTag;

	static $cache = null, $cachedContext = null;

	// Base the cache on the global config state so that if the globals are
	// changed (as happens in tests) the tables are re-initialised.
	$globalContext = (bool)$wgAllowImageTag;
	if ( $cache === null || $cachedContext !== $globalContext ) {
		$htmlpairsStatic = [ # Tags that must be closed
			'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
			'kbd', 'samp', 'data', 'time', 'mark'
		];
		$htmlsingle = [
			'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
		];

		# Elements that cannot have close tags. This is (not coincidentally)
		# also the list of tags for which the HTML 5 parsing algorithm
		# requires you to "acknowledge the token's self-closing flag", i.e.
		# a self-closing tag like <br/> is not an HTML 5 parse error only
		# for this list.
		$htmlsingleonly = [
			'br', 'wbr', 'hr', 'meta', 'link'
		];

		$htmlnest = [ # Tags that can be nested
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
			'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
		];
		$tabletags = [ # Can only appear inside table, we will close them
			'td', 'th', 'tr',
		];
		$htmllist = [ # Tags used by list
			'ul', 'ol',
		];
		$listtags = [ # Tags that can appear in a list
			'li',
		];

		if ( $globalContext ) {
			wfDeprecatedMsg( 'Setting $wgAllowImageTag to true ' .
				'is deprecated since MediaWiki 1.35', '1.35', false, false );
			$htmlsingle[] = 'img';
			$htmlsingleonly[] = 'img';
		}

		$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
		$htmlelementsStatic = array_unique(
			array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest )
		);

		# Convert them all to hashtables for faster lookup
		$cache = array_map( 'array_flip', [
			'htmlpairsStatic' => $htmlpairsStatic,
			'htmlsingle' => $htmlsingle,
			'htmlsingleonly' => $htmlsingleonly,
			'htmlnest' => $htmlnest,
			'tabletags' => $tabletags,
			'htmllist' => $htmllist,
			'listtags' => $listtags,
			'htmlsingleallowed' => $htmlsingleallowed,
			'htmlelementsStatic' => $htmlelementsStatic,
		] );
		$cachedContext = $globalContext;
	}

	# Populate htmlpairs and htmlelements with the $extratags and $removetags arrays
	$extratags = array_flip( $extratags );
	$removetags = array_flip( $removetags );
	$htmlpairs = array_merge( $extratags, $cache['htmlpairsStatic'] );
	$htmlelements = array_diff_key(
		array_merge( $extratags, $cache['htmlelementsStatic'] ),
		$removetags
	);

	return [
		'htmlpairs' => $htmlpairs,
		'htmlsingle' => $cache['htmlsingle'],
		'htmlsingleonly' => $cache['htmlsingleonly'],
		'htmlnest' => $cache['htmlnest'],
		'tabletags' => $cache['tabletags'],
		'htmllist' => $cache['htmllist'],
		'listtags' => $cache['listtags'],
		'htmlsingleallowed' => $htmlsingleallowed = $cache['htmlsingleallowed'],
		'htmlelements' => $htmlelements,
	];
}
227 
/**
 * Clean up HTML in wikitext: comments are removed, recognised tags are
 * kept with their attributes sanitised, and everything else that looks
 * like markup is escaped.
 *
 * @param string $text Input text
 * @param callable|null $processCallback If callable, invoked as
 *  $processCallback( &$params, $args ) on the attribute text of each
 *  recognised tag before validation
 * @param array|bool $args Second argument passed to $processCallback
 * @param array $extratags Extra tag names to recognise
 * @param array $removetags Tag names to stop recognising
 * @return string
 */
public static function removeHTMLtags( $text, $processCallback = null,
	$args = [], $extratags = [], $removetags = []
) {
	$tagData = self::getRecognizedTagData( $extratags, $removetags );
	$htmlsingle = $tagData['htmlsingle'];
	$htmlsingleonly = $tagData['htmlsingleonly'];
	$htmlelements = $tagData['htmlelements'];

	# Remove HTML comments, then split on every tag opener. The first piece
	# precedes any "<" and so can only need its ">" escaped.
	$bits = explode( '<', self::removeHTMLcomments( $text ) );
	$text = str_replace( '>', '&gt;', array_shift( $bits ) );

	# this might be possible using remex tidy itself
	foreach ( $bits as $bit ) {
		$accepted = false;

		if ( preg_match( self::ELEMENT_BITS_REGEX, $bit, $parts ) ) {
			[ , $slash, $tag, $params, $brace, $rest ] = $parts;
			$tag = strtolower( $tag );

			if ( isset( $htmlelements[$tag] ) ) {
				if ( is_callable( $processCallback ) ) {
					call_user_func_array( $processCallback, [ &$params, $args ] );
				}

				if ( $brace == '/>'
					&& !isset( $htmlsingle[$tag] )
					&& !isset( $htmlsingleonly[$tag] )
				) {
					// Remove the self-closing slash, to be consistent
					// with HTML5 semantics. T134423
					$brace = '>';
				}

				$tagIsValid = self::validateTag( $params, $tag );
				$attribText = self::fixTagAttributes( $params, $tag );

				if ( $tagIsValid ) {
					if ( $brace === '/>' && !isset( $htmlsingleonly[$tag] ) ) {
						# Interpret self-closing tags as empty tags even when
						# HTML 5 would interpret them as start tags. Such input
						# is commonly seen on Wikimedia wikis with this intention.
						$brace = "></$tag>";
					}

					$rest = str_replace( '>', '&gt;', $rest );
					$text .= "<$slash$tag$attribText$brace$rest";
					$accepted = true;
				}
			}
		}

		if ( !$accepted ) {
			# Not a tag we let through: emit the chunk fully escaped
			$text .= '&lt;' . str_replace( '>', '&gt;', $bit );
		}
	}

	return $text;
}
298 
/**
 * Strip every terminated HTML comment from the text.
 *
 * When a comment, together with any spaces around it, sits between two
 * newlines, the comment, those spaces and one of the newlines are all
 * removed so that no blank line is left behind. An unterminated comment
 * stops processing and is left in place.
 *
 * @param string $text
 * @return string
 */
public static function removeHTMLcomments( $text ) {
	for ( ; ; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		$close += 3;

		# Widen the span over the spaces on both sides of the comment
		$from = max( $open - 1, 0 );
		$length = $close - $from;
		while ( $from > 0 && substr( $text, $from, 1 ) === ' ' ) {
			$from--;
			$length++;
		}
		while ( substr( $text, $from + $length, 1 ) === ' ' ) {
			$length++;
		}

		$onOwnLine = substr( $text, $from, 1 ) === "\n"
			&& substr( $text, $from + $length, 1 ) === "\n";

		$text = $onOwnLine
			# Remove the comment, leading and trailing spaces,
			# and leave only one newline.
			? substr_replace( $text, "\n", $from, $length + 1 )
			# Remove just the comment.
			: substr_replace( $text, '', $open, $close - $open );
	}

	return $text;
}
341 
/**
 * Check that a tag carries the attributes it needs in order to be
 * allowed. Only <meta> and <link> have requirements: both need an
 * itemprop attribute, <meta> additionally needs content and <link>
 * additionally needs href.
 *
 * @param string $params Raw attribute text of the tag
 * @param string $element Lower-case tag name
 * @return bool True if the tag may be kept
 */
private static function validateTag( $params, $element ) {
	$attribs = self::decodeTagAttributes( $params );

	$isMeta = $element == 'meta';
	$isLink = $element == 'link';
	if ( !$isMeta && !$isLink ) {
		return true;
	}

	if ( !isset( $attribs['itemprop'] ) ) {
		// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
		return false;
	}
	if ( $isMeta && !isset( $attribs['content'] ) ) {
		// <meta> must have a content="" for the itemprop
		return false;
	}

	// <link> must have an associated href=""
	return !( $isLink && !isset( $attribs['href'] ) );
}
374 
/**
 * Filter an attribute map down to what is permitted on the given element,
 * using that element's allowed-attribute list.
 *
 * @param array $attribs Attribute name => value
 * @param string $element Tag name
 * @return array Filtered attribute name => value map
 */
public static function validateTagAttributes( $attribs, $element ) {
	$allowedForElement = self::attributesAllowedInternal( $element );

	return self::validateAttributes( $attribs, $allowedForElement );
}
394 
/**
 * Filter an attribute map against a set of allowed attribute names and
 * sanitise the values of the attributes that survive.
 *
 * xmlns:* declarations and unreserved data-* attributes are accepted
 * without being listed in $allowed.
 *
 * @param array $attribs Attribute name => value
 * @param array $allowed Allowed attribute names as array keys. A
 *  sequential list of names is still accepted but deprecated.
 * @return array Filtered attribute name => value map
 */
public static function validateAttributes( $attribs, $allowed ) {
	if ( isset( $allowed[0] ) ) {
		// Calling this function with a sequential array is
		// deprecated. For now just convert it.
		wfDeprecated( __METHOD__ . ' with sequential array', '1.35' );
		$allowed = array_flip( $allowed );
	}
	$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	# Attributes whose value is a space-separated list of ids
	$idListAttribs = [ 'aria-describedby', 'aria-flowto', 'aria-labelledby', 'aria-owns' ];
	# RDFa and microdata properties allow URLs, URIs and/or CURIs.
	$uriLikeAttribs = [
		'rel', 'rev',
		# RDFa
		'about', 'property', 'resource', 'datatype', 'typeof',
		# HTML5 microdata
		'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',
	];
	# Attributes whose value must be a URL with an allowed protocol
	$urlAttribs = [ 'href', 'src', 'poster' ];

	$out = [];
	foreach ( $attribs as $name => $val ) {
		# Allow XML namespace declaration to allow RDFa
		if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $name ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $val ) ) {
				$out[$name] = $val;
			}
			continue;
		}

		# Allow any attribute beginning with "data-". However:
		# * Disallow data attributes used by MediaWiki code
		# * Ensure that the attribute is not namespaced by banning colons.
		if ( !preg_match( '/^data-[^:]*$/i', $name )
			&& !array_key_exists( $name, $allowed )
		) {
			continue;
		}
		if ( self::isReservedDataAttribute( $name ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# https://msdn.microsoft.com/en-us/library/ms537634.aspx
		if ( $name == 'style' ) {
			$val = self::checkCss( $val );
		}

		# Escape HTML id attributes
		if ( $name === 'id' ) {
			$val = self::escapeIdForAttribute( $val, self::ID_PRIMARY );
		}

		# Escape HTML id reference lists
		if ( in_array( $name, $idListAttribs, true ) ) {
			$val = self::escapeIdReferenceListInternal( $val );
		}

		// Paranoia. Allow "simple" values but suppress javascript
		if ( in_array( $name, $uriLikeAttribs, true )
			&& preg_match( self::EVIL_URI_PATTERN, $val )
		) {
			continue;
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		# validation code that can be used by tag hook handlers, etc
		if ( in_array( $name, $urlAttribs, true ) && !preg_match( $hrefExp, $val ) ) {
			// Drop any href or src attributes not using an allowed protocol.
			// NOTE: this also drops all relative URLs
			continue;
		}

		// Only allow tabindex of 0, which is useful for accessibility.
		if ( $name === 'tabindex' && $val !== '0' ) {
			continue;
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$out[$name] = $val;
	}

	# itemtype, itemid, itemref don't make sense without itemscope
	if ( !array_key_exists( 'itemscope', $out ) ) {
		unset( $out['itemtype'], $out['itemid'], $out['itemref'] );
	}
	# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

	return $out;
}
512 
/**
 * Tell whether a data-* attribute name is reserved and therefore may not
 * come from user input.
 *
 * data-ooui is reserved for ooui; data-mw and data-parsoid are reserved
 * for parsoid; data-mw-<name> is reserved for extensions (or core) that
 * need to pass data to the client and be sure it did not come from an
 * untrusted user. Namespaces are ignored since user-generated HTML can
 * no longer use them.
 *
 * @param string $attr Attribute name
 * @return bool
 */
public static function isReservedDataAttribute( $attr ) {
	$reservedPrefix = '/^data-(ooui|mw|parsoid)/i';

	return preg_match( $reservedPrefix, $attr ) === 1;
}
530 
/**
 * Merge two attribute maps. Values from $b override those from $a,
 * except that two differing string "class" values are combined into one
 * space-separated list without duplicates.
 *
 * @param array $a
 * @param array $b
 * @return array
 */
public static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	$bothHaveClass = isset( $a['class'] ) && isset( $b['class'] );
	if ( !$bothHaveClass ) {
		return $merged;
	}

	$classA = $a['class'];
	$classB = $b['class'];
	if ( is_string( $classA ) && is_string( $classB ) && $classA !== $classB ) {
		$names = preg_split( '/\s+/', "{$classA} {$classB}", -1, PREG_SPLIT_NO_EMPTY );
		$merged['class'] = implode( ' ', array_unique( $names ) );
	}

	return $merged;
}
553 
/**
 * Normalise CSS into a form in which character references, escape
 * sequences and comments can no longer hide content from later checks.
 *
 * @param string $value CSS text, e.g. the value of a style attribute
 * @return string Normalised CSS
 */
public static function normalizeCss( $value ) {
	static $decodeRegex;

	// Decode character references like &#123;
	$value = self::decodeCharReferences( $value );

	// Decode escape sequences and line continuation.
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references, so that
	// this function cannot return unsanitized escape sequences. Input can
	// be crafted whose character references decode to escape sequences
	// that in turn decode to character references, but returning
	// character references is fine because the caller is supposed to
	// escape those anyway.
	if ( $decodeRegex === null ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$decodeRegex = "/ $backslash
			(?:
				($nl) | # 1. Line continuation
				([0-9A-Fa-f]{1,6})$space? | # 2. character number
				(.) | # 3. backslash cancelling special meaning
				() | # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback(
		$decodeRegex,
		[ self::class, 'cssDecodeCallback' ],
		$value
	);

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if ( preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
		return $value;
	}

	// Remove any comments; IE gets token splitting wrong.
	// This must be done AFTER decoding character references and escape
	// sequences, because those steps can introduce comments. This step
	// cannot introduce character references or escape sequences, because
	// it replaces comments with spaces rather than removing them.
	$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

	// Remove anything after a comment-start token, to guard against
	// incorrect client implementations.
	$unclosedAt = strpos( $value, '/*' );

	return $unclosedAt === false ? $value : substr( $value, 0, $unclosedAt );
}
613 
/**
 * Normalise a CSS value and replace it with a CSS comment if it contains
 * anything considered unsafe.
 *
 * The keyword pattern previously contained stray zero-width joiner
 * characters between "\" and "(", which left its groups unbalanced;
 * they are removed here so that every "\(" is a literal parenthesis.
 *
 * @param string $value CSS text, e.g. the value of a style attribute
 * @return string The normalised CSS, or one of the placeholder comments
 *  "/* invalid control char *​/" and "/* insecure input *​/"
 */
public static function checkCss( $value ) {
	$value = self::normalizeCss( $value );

	// Reject control characters, and the replacement character that
	// decodeChar() emits for invalid character references
	if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ||
		strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false
	) {
		return '/* invalid control char */';
	}

	// Reject problematic keywords and functions
	$insecurePattern = '! expression
		| filter\s*:
		| accelerator\s*:
		| -o-link\s*:
		| -o-link-source\s*:
		| -o-replace\s*:
		| url\s*\(
		| image\s*\(
		| image-set\s*\(
		| attr\s*\([^)]+[\s,]+url
		| var\s*\(
		!ix';
	if ( preg_match( $insecurePattern, $value ) ) {
		return '/* insecure input */';
	}

	return $value;
}
656 
/**
 * preg_replace_callback() handler for normalizeCss(): decode one CSS
 * backslash escape.
 *
 * @param array $matches Groups of the decode pattern: 1 = line
 *  continuation, 2 = hexadecimal character number, 3 = escaped character
 * @return string Replacement text
 */
private static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		// Line continuation
		return '';
	}

	if ( $matches[2] !== '' ) {
		$char = UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$char = $matches[3];
	} else {
		// Backslash at end of string
		$char = '\\';
	}

	// These characters need to be escaped in strings, so clean up the
	// escape sequence to avoid parsing errors by clients. Anything else
	// was an unnecessary escape and is returned decoded.
	$mustStayEscaped = $char == "\n" || $char == '"' || $char == "'" || $char == '\\';

	return $mustStayEscaped
		? '\\' . dechex( ord( $char ) ) . ' '
		: $char;
}
681 
/**
 * Take the raw attribute text of a tag, drop attributes that are not
 * allowed on the element, and return the remainder re-encoded.
 *
 * @param string $text Raw attribute text
 * @param string $element Tag name
 * @param bool $sorted Whether to sort the attributes by name
 * @return string Encoded attribute text with a leading space, or '' if
 *  nothing is left
 */
public static function fixTagAttributes( $text, $element, $sorted = false ) {
	if ( trim( $text ) == '' ) {
		return '';
	}

	$allowed = self::validateTagAttributes(
		self::decodeTagAttributes( $text ),
		$element
	);
	if ( $sorted ) {
		ksort( $allowed );
	}

	return self::safeEncodeTagAttributes( $allowed );
}
717 
/**
 * Encode a string for use as an HTML attribute value.
 *
 * @param string $text
 * @return string HTML-encoded text
 */
public static function encodeAttribute( $text ) {
	$escaped = htmlspecialchars( $text, ENT_QUOTES );

	// Whitespace is normalized during attribute decoding,
	// so if we've been passed non-spaces we must encode them
	// ahead of time or they won't be preserved.
	return str_replace(
		[ "\n", "\r", "\t" ],
		[ '&#10;', '&#13;', '&#9;' ],
		$escaped
	);
}
737 
/**
 * Replace the space before certain punctuation marks, and the space
 * after an opening guillemet, with $space so that it cannot wrap.
 *
 * The pattern that escapes $space for use as a replacement string
 * previously contained stray zero-width joiner characters, which altered
 * its lookbehind and alternation; they are removed here.
 *
 * @param string $text Text to process
 * @param string $space Replacement for the space, inserted literally
 * @return string
 */
public static function armorFrenchSpaces( $text, $space = '&#160;' ) {
	// Replace $ with \$ and \ with \\ so that $space is taken literally
	// when used as a preg_replace() replacement below
	$space = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );
	$fixtags = [
		# French spaces, last one Guillemet-left
		# only if there is something before the space
		# and a non-word character after the punctuation.
		'/(?:(?<=\S)|^) (?=[?:;!%»›](?!\w))/u' => "$space",
		# French spaces, Guillemet-right
		'/([«‹]) /u' => "\\1$space",
	];
	return preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
}
759 
/**
 * Encode a string for use as an HTML attribute value and additionally
 * armour it against being interpreted as wikitext later on.
 *
 * @param string $text
 * @return string HTML-encoded text
 */
public static function safeEncodeAttribute( $text ) {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$wikitextArmour = [
		'<' => '&lt;', // This should never happen,
		'>' => '&gt;', // we've received invalid input
		'"' => '&quot;', // which should have been escaped.
		'{' => '&#123;',
		'}' => '&#125;', // prevent unpaired language conversion syntax
		'[' => '&#91;',
		']' => '&#93;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	];
	$armoured = strtr( self::encodeAttribute( $text ), $wikitextArmour );

	# Stupid hack: encode the colon of anything that looks like a URL protocol
	$protocolPattern = '/((?i)' . wfUrlProtocols() . ')/';
	$encodeColon = static function ( $m ) {
		return str_replace( ':', '&#58;', $m[1] );
	};

	return preg_replace_callback( $protocolPattern, $encodeColon, $armoured );
}
796 
/**
 * Escape a string so that it can be used as the value of an id
 * attribute, using the escaping mode that $wgFragmentMode configures
 * for $mode.
 *
 * @param string $id String to escape
 * @param int $mode One of the ID_* constants
 * @return string|bool Escaped id, or false if no fallback mode is configured
 * @throws UnexpectedValueException if no primary mode is configured
 */
public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
	global $wgFragmentMode;

	if ( isset( $wgFragmentMode[$mode] ) ) {
		return self::escapeIdInternal( $id, $wgFragmentMode[$mode] );
	}

	if ( $mode === self::ID_PRIMARY ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return false;
}
826 
/**
 * Escape a string so that it can be used as the fragment part of a link,
 * using the primary escaping mode from $wgFragmentMode.
 *
 * The mode is read from $wgFragmentMode[self::ID_PRIMARY]; previously
 * $mode was passed on without ever being assigned.
 *
 * @param string $id String to escape
 * @return string Escaped fragment
 * @throws UnexpectedValueException if no primary mode is configured
 */
public static function escapeIdForLink( $id ) {
	global $wgFragmentMode;

	if ( !isset( $wgFragmentMode[self::ID_PRIMARY] ) ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	$mode = $wgFragmentMode[self::ID_PRIMARY];

	$id = self::escapeIdInternalUrl( $id, $mode );

	return $id;
}
852 
/**
 * Escape a string so that it can be used as the fragment part of a link
 * to a wiki that is not part of this farm.
 *
 * The body previously returned $id untouched. It now escapes the id with
 * the mode configured in $wgExternalInterwikiFragmentMode.
 * NOTE(review): the configuration variable is taken from MediaWiki's
 * documented settings, not from this file; confirm against upstream.
 *
 * @param string $id String to escape
 * @return string Escaped fragment
 */
public static function escapeIdForExternalInterwiki( $id ) {
	global $wgExternalInterwikiFragmentMode;

	$id = self::escapeIdInternalUrl( $id, $wgExternalInterwikiFragmentMode );

	return $id;
}
869 
/**
 * Escape an id for use in a URL fragment. In 'html5' mode, anything in
 * the escaped id that looks like a percent-encoded byte gets its "%"
 * encoded as "%25".
 *
 * @param string $id String to escape
 * @param string $mode Escaping mode, as accepted by escapeIdInternal()
 * @return string
 */
private static function escapeIdInternalUrl( $id, $mode ) {
	$escaped = self::escapeIdInternal( $id, $mode );

	if ( $mode !== 'html5' ) {
		return $escaped;
	}

	return preg_replace( '/%([a-fA-F0-9]{2})/', '%25$1', $escaped );
}
886 
/**
 * Escape an id according to the given mode.
 *
 * @param string $id String to escape
 * @param string $mode Either 'html5' or 'legacy'
 * @return string
 * @throws InvalidArgumentException for any other mode
 */
private static function escapeIdInternal( $id, $mode ) {
	// Truncate overly-long IDs. This isn't an HTML limit, it's just
	// griefer protection. [T251506]
	$id = mb_substr( $id, 0, 1024 );

	if ( $mode == 'html5' ) {
		// html5 spec says ids must not have any of the following:
		// U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE
		// In practice, in wikitext, only tab, LF, CR (and SPACE) are
		// possible using either Lua or html entities.
		return str_replace( [ "\t", "\n", "\f", "\r", " " ], '_', $id );
	}

	if ( $mode == 'legacy' ) {
		// This corresponds to 'noninitial' mode of the former escapeId()
		$encoded = urlencode( str_replace( ' ', '_', $id ) );

		return strtr( $encoded, [
			'%3A' => ':',
			'%' => '.'
		] );
	}

	throw new InvalidArgumentException( "Invalid mode '$mode' passed to '" . __METHOD__ );
}
923 
/**
 * Escape each id in a space-separated list of ids.
 *
 * @deprecated since 1.36
 * @param string $referenceString Space-separated list of ids
 * @return string
 */
public static function escapeIdReferenceList( $referenceString ) {
	wfDeprecated( __METHOD__, '1.36' );

	$escapedList = self::escapeIdReferenceListInternal( $referenceString );

	return $escapedList;
}
938 
/**
 * Escape each id in a whitespace-separated list of ids and join the
 * results with single spaces.
 *
 * @param string $referenceString Whitespace-separated list of ids
 * @return string Escaped list; '' if the input held no ids
 */
private static function escapeIdReferenceListInternal( $referenceString ) {
	# Explode the space delimited list string into an array of tokens
	$tokens = preg_split( '/\s+/', (string)$referenceString, -1, PREG_SPLIT_NO_EMPTY );

	# Escape each token as an id
	$escapedTokens = array_map(
		static function ( $token ) {
			return self::escapeIdForAttribute( $token );
		},
		$tokens
	);

	# Merge the array back to a space delimited list string
	# If the array is empty, the result will be an empty string ('')
	return implode( ' ', $escapedTokens );
}
961 
/**
 * Turn a string into something usable as a CSS class name: characters
 * that are awkward in a class name become underscores, runs of
 * underscores are collapsed, and trailing underscores are removed.
 *
 * @param string $class
 * @return string
 */
public static function escapeClass( $class ) {
	// Convert ugly stuff to underscores and kill underscores in ugly places
	$patterns = [
		'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
		'/_+/',
	];
	$underscored = preg_replace( $patterns, '_', $class );

	return rtrim( $underscored, '_' );
}
980 
/**
 * Escape text for HTML output while leaving the character references it
 * already contains meaningful: they are decoded first, then the whole
 * string is escaped.
 *
 * @param string $html
 * @return string Escaped text
 */
public static function escapeHtmlAllowEntities( $html ) {
	$decoded = self::decodeCharReferences( $html );

	# It seems wise to escape ' as well as ", as a matter of course. Can't
	# hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
	# don't cause the entire string to disappear.
	return htmlspecialchars( $decoded, ENT_QUOTES | ENT_SUBSTITUTE );
}
996 
/**
 * Parse the raw attribute text of a tag into a map of lower-cased
 * attribute name => decoded value. Attributes whose name contains
 * unacceptable characters are dropped; when a name repeats, the last
 * occurrence wins.
 *
 * @param string $text Raw attribute text
 * @return array
 */
public static function decodeTagAttributes( $text ) {
	if ( trim( $text ) == '' ) {
		return [];
	}

	$sets = [];
	$found = preg_match_all( self::getAttribsRegex(), $text, $sets, PREG_SET_ORDER );
	if ( !$found ) {
		return [];
	}

	$attribs = [];
	foreach ( $sets as $set ) {
		$name = strtolower( $set[1] );

		// Keep only attribute names made of acceptable characters
		if ( preg_match( self::getAttribNameRegex(), $name ) ) {
			$raw = self::getTagAttributeCallback( $set );

			// Normalize whitespace
			$collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

			// Decode character references
			$attribs[$name] = self::decodeCharReferences( $collapsed );
		}
	}

	return $attribs;
}
1039 
/**
 * Build the attribute text of a tag from an attribute map, encoding the
 * names and values so the result is safe to embed in wikitext output.
 *
 * @param array $assoc_array Attribute name => value
 * @return string Attribute text with a leading space, or '' for an empty map
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
	$pieces = [];
	foreach ( $assoc_array as $name => $val ) {
		$pieces[] = htmlspecialchars( $name ) . '="' . self::safeEncodeAttribute( $val ) . '"';
	}

	if ( !$pieces ) {
		return '';
	}

	return ' ' . implode( ' ', $pieces );
}
1057 
/**
 * Pick the attribute value out of one match of the attributes pattern.
 *
 * @param array $set One PREG_SET_ORDER match: 3 = double-quoted value,
 *  4 = single-quoted value, 5 = unquoted value, 2 = "=" plus value
 * @return string The value, or '' for an attribute given without a value
 * @throws MWException if group 2 is set but none of the value groups is
 */
private static function getTagAttributeCallback( $set ) {
	# Check the unquoted (5), single-quoted (4) and double-quoted (3)
	# groups, in that order.
	foreach ( [ 5, 4, 3 ] as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( isset( $set[2] ) ) {
		throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
	}

	# In XHTML, attributes must have a value so return an empty string.
	# See "Empty attribute syntax",
	# https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
	return "";
}
1085 
/**
 * Collapse every run of spaces, tabs, carriage returns and newlines into
 * a single space and trim the result.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
	$collapsed = preg_replace( '/(?:\r\n|[\x20\x0d\x0a\x09])+/', ' ', $text );

	return trim( $collapsed );
}
1096 
/**
 * Normalise whitespace in a section name: every run of spaces and
 * underscores becomes a single space, and the result is trimmed.
 *
 * @param string $section
 * @return string
 */
public static function normalizeSectionNameWhitespace( $section ) {
	$collapsed = preg_replace( '/[ _]+/', ' ', $section );

	return trim( $collapsed );
}
1108 
/**
 * Normalise every character reference in the text: each match of
 * CHAR_REFS_REGEX is rewritten by normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function normalizeCharReferences( $text ) {
	$normalizeOne = static function ( $matches ) {
		return self::normalizeCharReferencesCallback( $matches );
	};

	return preg_replace_callback( self::CHAR_REFS_REGEX, $normalizeOne, $text );
}
1130 
/**
 * preg_replace_callback() handler for normalizeCharReferences().
 *
 * @param array $matches Groups of CHAR_REFS_REGEX: 1 = named entity,
 *  2 = decimal code point, 3 = hexadecimal code point
 * @return string The normalised reference, or the matched text
 *  HTML-escaped when it is not a usable reference
 */
private static function normalizeCharReferencesCallback( $matches ) {
	$normalized = null;

	if ( $matches[1] != '' ) {
		$normalized = self::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		$normalized = self::decCharReference( $matches[2] );
	} elseif ( $matches[3] != '' ) {
		$normalized = self::hexCharReference( $matches[3] );
	}

	// A bare ampersand, or a reference to an invalid code point
	return $normalized ?? htmlspecialchars( $matches[0] );
}
1150 
/**
 * Normalise a named entity. MediaWiki-specific aliases become the entity
 * they stand for; lt, gt, amp and quot stay in word form; other known
 * entities become decimal references; unknown names have their
 * ampersand escaped.
 *
 * @param string $name Entity name including the trailing ";"
 * @return string
 */
private static function normalizeEntity( $name ) {
	// Non-standard MediaWiki-specific entities
	if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
		return '&' . self::MW_ENTITY_ALIASES[$name];
	}

	// Keep these in word form
	$keptAsWords = [ 'lt;', 'gt;', 'amp;', 'quot;' ];
	if ( in_array( $name, $keptAsWords, true ) ) {
		return "&$name";
	}

	if ( !isset( HTMLData::$namedEntityTranslations[$name] ) ) {
		return "&amp;$name";
	}

	// Beware: some entities expand to more than 1 codepoint
	$toDecimalReference = static function ( $m ) {
		return '&#' . UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';';
	};

	return preg_replace_callback(
		'/./Ssu',
		$toDecimalReference,
		HTMLData::$namedEntityTranslations[$name]
	);
}
1177 
/**
 * Normalise a decimal character reference.
 *
 * @param string $codepoint Decimal digits of the reference
 * @return string|null The reference in "&#N;" form, or null if the code
 *  point is not valid
 */
private static function decCharReference( $codepoint ) {
	$point = intval( $codepoint );

	return self::validateCodepoint( $point )
		? sprintf( '&#%d;', $point )
		: null;
}
1190 
/**
 * Normalise a hexadecimal character reference.
 *
 * @param string $codepoint Hexadecimal digits of the reference
 * @return string|null The reference in lower-case "&#xN;" form, or null
 *  if the code point is not valid
 */
private static function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );

	return self::validateCodepoint( $point )
		? sprintf( '&#x%x;', $point )
		: null;
}
1203 
/**
 * Tell whether a code point may be used in a character reference.
 *
 * @param int|float $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	# U+007F - U+009F are disallowed in HTML5 (control characters).
	if ( $codepoint == 0x09 || $codepoint == 0x0a ) {
		return true;
	}

	$validRanges = [
		[ 0x20, 0x7e ],
		[ 0xa0, 0xd7ff ],
		[ 0xe000, 0xfffd ],
		[ 0x10000, 0x10ffff ],
	];
	foreach ( $validRanges as [ $low, $high ] ) {
		if ( $codepoint >= $low && $codepoint <= $high ) {
			return true;
		}
	}

	return false;
}
1221 
/**
 * Decode every character reference in the text: each match of
 * CHAR_REFS_REGEX is rewritten by decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
	$decodeOne = static function ( $matches ) {
		return self::decodeCharReferencesCallback( $matches );
	};

	return preg_replace_callback( self::CHAR_REFS_REGEX, $decodeOne, $text );
}
1235 
/**
 * Decode every character reference in the text and, if CHAR_REFS_REGEX
 * matched at least once, pass the result through the content
 * language's normalize().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
	$decodeOne = static function ( $matches ) {
		return self::decodeCharReferencesCallback( $matches );
	};
	$replacements = 0;
	$text = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		$decodeOne,
		$text,
		-1, // limit
		$replacements
	);

	if ( !$replacements ) {
		// Nothing matched, so nothing can need normalising
		return $text;
	}

	return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $text );
}
1261 
/**
 * preg_replace_callback() handler for decodeCharReferences().
 *
 * @param array $matches Groups of CHAR_REFS_REGEX: 1 = named entity,
 *  2 = decimal code point, 3 = hexadecimal code point
 * @return string Decoded text, or the matched text itself for a bare ampersand
 */
private static function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		return self::decodeEntity( $matches[1] );
	}
	if ( $matches[2] != '' ) {
		return self::decodeChar( intval( $matches[2] ) );
	}
	if ( $matches[3] != '' ) {
		return self::decodeChar( hexdec( $matches[3] ) );
	}

	# Last case should be an ampersand by itself
	return $matches[0];
}
1277 
/**
 * Return the UTF-8 encoding of a code point, or the UTF-8 replacement
 * character if the code point is not valid.
 *
 * @param int|float $codepoint
 * @return string
 */
private static function decodeChar( $codepoint ) {
	if ( !self::validateCodepoint( $codepoint ) ) {
		return UtfNormal\Constants::UTF8_REPLACEMENT;
	}

	return UtfNormal\Utils::codepointToUtf8( $codepoint );
}
1292 
/**
 * Decode a named entity to the text it stands for. A name with no known
 * translation is returned as a pseudo-entity, i.e. "&" followed by the
 * name.
 *
 * @param string $name Entity name including the trailing ";"
 * @return string
 */
private static function decodeEntity( $name ) {
	// These are MediaWiki-specific entities, not in the HTML standard
	$canonical = self::MW_ENTITY_ALIASES[$name] ?? $name;

	return HTMLData::$namedEntityTranslations[$canonical] ?? "&$canonical";
}
1309 
/**
 * Return the attributes allowed on the given element.
 *
 * The table comes from setupAttributesAllowedInternal(); previously
 * $list was read without ever being assigned.
 *
 * @param string $element Tag name
 * @return array Allowed attribute names as array keys; empty for an
 *  element that has no entry
 */
private static function attributesAllowedInternal( $element ) {
	$list = self::setupAttributesAllowedInternal();
	return $list[$element] ?? [];
}
1321 
/**
 * Build the map of allowed HTML elements to the attributes allowed on them.
 *
 * Each key of the returned array is an element name. Each value is an array
 * whose keys are the attribute names permitted on that element; the
 * attribute lists are flipped so that a lookup is a key test rather than a
 * search. The map is built on the first call only and kept in a
 * function-static variable afterwards.
 *
 * @return array[] Map of element name => ( attribute name => list index )
 */
private static function setupAttributesAllowedInternal() {
	static $allowed;

	if ( isset( $allowed ) ) {
		return $allowed;
	}

	// Adds one or two plain lists of attribute names to an already-flipped
	// set, flipping the lists so that the attribute names become the keys.
	$extend = static function ( $base, $names, $moreNames = [] ) {
		$nameKeys = array_flip( $names );
		$moreKeys = array_flip( $moreNames );
		return array_merge( $base, $nameKeys, $moreKeys );
	};

	// Attributes accepted on (nearly) every allowed element
	$shared = $extend( [], [
		# HTML
		'id',
		'class',
		'style',
		'lang',
		'dir',
		'title',
		'tabindex',

		# WAI-ARIA
		'aria-describedby',
		'aria-flowto',
		'aria-hidden',
		'aria-label',
		'aria-labelledby',
		'aria-owns',
		'role',

		# RDFa
		# These attributes are specified in section 9 of
		# https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		'about',
		'property',
		'resource',
		'datatype',
		'typeof',

		# Microdata. These are specified by
		# https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
		'itemid',
		'itemprop',
		'itemref',
		'itemscope',
		'itemtype',
	] );

	// Plain attribute lists used by the table elements
	$cellAlign = [ 'align', 'valign' ];
	$cellAttribs = [
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor', # deprecated
	];

	// Attribute sets that are used by more than one element
	$alignable = $extend( $shared, [ 'align' ] );
	$citing = $extend( $shared, [ 'cite' ] );
	$edited = $extend( $shared, [ 'cite', 'datetime' ] );
	$spanning = $extend( $shared, [ 'span' ] );
	$widthOnly = $extend( $shared, [ 'width' ] );
	$cell = $extend( $shared, $cellAttribs, $cellAlign );

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: https://www.w3.org/TR/html4/
	$allowed = [
		# 7.5.4
		'div' => $alignable,
		'center' => $shared, # deprecated
		'span' => $shared,

		# 7.5.5
		'h1' => $alignable,
		'h2' => $alignable,
		'h3' => $alignable,
		'h4' => $alignable,
		'h5' => $alignable,
		'h6' => $alignable,

		# 7.5.6
		# address (not listed)

		# 8.2.4
		'bdo' => $shared,

		# 9.2.1
		'em' => $shared,
		'strong' => $shared,
		'cite' => $shared,
		'dfn' => $shared,
		'code' => $shared,
		'samp' => $shared,
		'kbd' => $shared,
		'var' => $shared,
		'abbr' => $shared,
		# acronym (not listed)

		# 9.2.2
		'blockquote' => $citing,
		'q' => $citing,

		# 9.2.3
		'sub' => $shared,
		'sup' => $shared,

		# 9.3.1
		'p' => $alignable,

		# 9.3.2
		'br' => $extend( $shared, [ 'clear' ] ),

		# https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
		'wbr' => $shared,

		# 9.3.4
		'pre' => $widthOnly,

		# 9.4
		'ins' => $edited,
		'del' => $edited,

		# 10.2
		'ul' => $extend( $shared, [ 'type' ] ),
		'ol' => $extend( $shared, [ 'type', 'start', 'reversed' ] ),
		'li' => $extend( $shared, [ 'type', 'value' ] ),

		# 10.3
		'dl' => $shared,
		'dd' => $shared,
		'dt' => $shared,

		# 11.2.1
		'table' => $extend( $shared, [
			'summary', 'width', 'border', 'frame',
			'rules', 'cellspacing', 'cellpadding',
			'align', 'bgcolor',
		] ),

		# 11.2.2
		'caption' => $alignable,

		# 11.2.3
		'thead' => $shared,
		'tfoot' => $shared,
		'tbody' => $shared,

		# 11.2.4
		'colgroup' => $spanning,
		'col' => $spanning,

		# 11.2.5
		'tr' => $extend( $shared, [ 'bgcolor' ], $cellAlign ),

		# 11.2.6
		'td' => $cell,
		'th' => $cell,

		# 12.2
		# NOTE: <a> is not allowed directly, but this list of allowed
		# attributes is used from the Parser object
		'a' => $extend( $shared, [ 'href', 'rel', 'rev' ] ), # rel/rev esp. for RDFa

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized, or if $wgAllowImageTag is
		# true
		'img' => $extend( $shared, [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
		# Attributes for A/V tags added in T163583 / T133673
		'audio' => $extend( $shared, [ 'controls', 'preload', 'width', 'height' ] ),
		'video' => $extend( $shared, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
		'source' => $extend( $shared, [ 'type', 'src' ] ),
		'track' => $extend( $shared, [ 'type', 'src', 'srclang', 'kind', 'label' ] ),

		# 15.2.1
		'tt' => $shared,
		'b' => $shared,
		'i' => $shared,
		'big' => $shared,
		'small' => $shared,
		'strike' => $shared,
		's' => $shared,
		'u' => $shared,

		# 15.2.2
		'font' => $extend( $shared, [ 'size', 'color', 'face' ] ),
		# basefont (not listed)

		# 15.3
		'hr' => $widthOnly,

		# HTML Ruby annotation text module, simple ruby only.
		# https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
		'ruby' => $shared,
		# rbc (not listed)
		'rb' => $shared,
		'rp' => $shared,
		'rt' => $shared, # 'rbspan' is not currently allowed
		'rtc' => $shared,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# https://www.w3.org/TR/REC-MathML/
		'math' => $extend( [], [ 'class', 'style', 'id', 'title' ] ),

		// HTML 5 section 4.5
		'figure' => $shared,
		'figcaption' => $shared,

		# HTML 5 section 4.6
		'bdi' => $shared,

		# HTML5 elements, defined by:
		# https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
		'data' => $extend( $shared, [ 'value' ] ),
		'time' => $extend( $shared, [ 'datetime' ] ),
		'mark' => $shared,

		// meta and link are only permitted by removeHTMLtags when Microdata
		// is enabled so we don't bother adding a conditional to hide these
		// Also meta and link are only valid in WikiText as Microdata elements
		// (ie: validateTag rejects tags missing the attributes needed for Microdata)
		// So we don't bother including the shared attributes that have no purpose.
		'meta' => $extend( [], [ 'itemprop', 'content' ] ),
		'link' => $extend( [], [ 'itemprop', 'href', 'title' ] ),
	];

	return $allowed;
}
1559 
/**
 * Take a fragment of (potentially invalid) HTML and return its text content
 * with any tags removed.
 *
 * The input is tokenized with RemexHtml; a RemexStripTagHandler collects the
 * text, which is then passed through normalizeWhitespace().
 *
 * @param string $html HTML fragment
 * @return string Text collected by the handler, whitespace-normalized
 */
public static function stripAllTags( $html ) {
	$tokenizerOptions = [
		'ignoreErrors' => true,
		// 'ignoreCharRefs' is deliberately left out: we want character
		// references to be decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	];

	// Use RemexHtml to tokenize $html and extract the text
	$textCollector = new RemexStripTagHandler();
	$tokenizer = new RemexHtml\Tokenizer\Tokenizer( $textCollector, $html, $tokenizerOptions );
	$tokenizer->execute();

	return self::normalizeWhitespace( $textCollector->getResult() );
}
1586 
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations.
 *
 * One <!ENTITY> declaration is emitted for every semicolon-terminated name
 * in HTMLData::$namedEntityTranslations whose normalizeEntity() result
 * differs from the entity itself.
 *
 * @return string DOCTYPE declaration with an internal entity subset
 */
public static function hackDocType() {
	$declarations = '';

	foreach ( array_keys( HTMLData::$namedEntityTranslations ) as $entity ) {
		// Some HTML entities omit the trailing semicolon;
		// wikitext does not permit these.
		if ( substr( $entity, -1 ) === ';' ) {
			$expansion = self::normalizeEntity( $entity );
			// Entities that normalize to themselves are skipped (&lt; &gt; etc)
			if ( $expansion !== $entity ) {
				$name = substr( $entity, 0, -1 );
				$declarations .= "<!ENTITY $name \"$expansion\">";
			}
		}
	}

	return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
1617 
/**
 * Clean up a URL: decode character references, percent-encode unsafe
 * characters and tidy the host portion.
 *
 * Characters that are ignored in IDNs are stripped from the host, and a
 * percent-encoded bracketed IPv6 host (//%5B...%5D) is turned back into its
 * literal [..] form. A string that does not look like "protocol:..." is
 * returned after the decoding and escaping steps only.
 *
 * @param string $url
 * @return string
 */
public static function cleanUrl( $url ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = self::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step
	$url = preg_replace_callback(
		'/[\][<>"\\x00-\\x20\\x7F\|]/',
		[ __CLASS__, 'cleanUrlCallback' ],
		$url
	);

	# Split into protocol, optional //host and the rest; nothing more to
	# do if the string does not have that shape
	$parts = [];
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $parts ) ) {
		return $url;
	}
	$protocol = $parts[1];
	$host = $parts[2];
	$rest = $parts[3];

	// Characters that will be ignored in IDNs.
	// https://tools.ietf.org/html/rfc3454#section-3.1
	// Strip them before further processing so deny lists and such work.
	$ignoredInIdn = "/
		\\s| # general whitespace
		\xc2\xad| # 00ad SOFT HYPHEN
		\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
		\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
		\xe2\x81\xa0| # 2060 WORD JOINER
		\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
		\xcd\x8f| # 034f COMBINING GRAPHEME JOINER
		\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
		\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
		\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
		\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
		\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
		[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
		/xuD";
	$host = preg_replace( $ignoredInIdn, '', $host );

	// IPv6 host names are bracketed with []. Url-decode these.
	$ipv6 = [];
	if ( substr_compare( "//%5B", $host, 0, 5 ) === 0 &&
		preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $ipv6 )
	) {
		$host = "//[{$ipv6[1]}]{$ipv6[2]}";
	}

	// @todo FIXME: Validate hostnames here

	return $protocol . $host . $rest;
}
1671 
/**
 * preg_replace_callback() handler used by cleanUrl(): percent-encodes the
 * matched text.
 *
 * @param array $matches Match array; only the full match (index 0) is used
 * @return string The full match passed through urlencode()
 */
private static function cleanUrlCallback( $matches ) {
	$fullMatch = $matches[0];

	return urlencode( $fullMatch );
}
1679 
/**
 * Does a string look like an e-mail address?
 *
 * The IsValidEmailAddr hook is run first; if it returns false, whatever the
 * hook stored in $result is returned as-is. Otherwise the address is matched
 * against a pattern made of a liberal user part, an "@", and one or more
 * dot-separated domain parts made of letters, digits and hyphens.
 *
 * The pattern is anchored with the D (dollar-end-only) modifier so that "$"
 * matches only at the very end of the subject. Without it PCRE also lets
 * "$" match just before a final newline, so "user@example.com\n" would be
 * accepted as valid.
 *
 * @param string $addr E-mail address to check
 * @return bool|mixed True if the address matches the pattern, false if not;
 *  the hook-provided $result when the hook returns false
 */
public static function validateEmail( $addr ) {
	$result = null;
	// TODO This method should be non-static, and have a HookRunner injected
	if ( !Hooks::runner()->onIsValidEmailAddr( $addr, $result ) ) {
		return $result;
	}

	// Please note the strings below are enclosed in brackets [], which makes
	// the hyphen "-" a range indicator. Hence it is double backslashed below.
	// See T28948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	$html5_email_regexp = "/
		^ # start of string
		[$rfc5322_atext\\.]+ # user part which is liberal :p
		@ # 'apostrophe'
		[$rfc1034_ldh_str]+ # First domain part
		(\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
		$ # End of string
		/ixD"; // case Insensitive, eXtended, Dollar matches at the very end only

	return (bool)preg_match( $html5_email_regexp, $addr );
}
1732 }
Sanitizer\ID_FALLBACK
const ID_FALLBACK
Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false if no fallback...
Definition: Sanitizer.php:78
Sanitizer\normalizeEntity
static normalizeEntity( $name)
If the named entity is defined in HTML5 return the equivalent numeric entity reference (except for th...
Definition: Sanitizer.php:1161
Sanitizer\getTagAttributeCallback
static getTagAttributeCallback( $set)
Pick the appropriate attribute value from a match set from the attribs regex matches.
Definition: Sanitizer.php:1066
Sanitizer\removeHTMLcomments
static removeHTMLcomments( $text)
Remove '<!--', '-->', and everything between.
Definition: Sanitizer.php:308
Sanitizer\stripAllTags
static stripAllTags( $html)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed,...
Definition: Sanitizer.php:1571
Sanitizer\EVIL_URI_PATTERN
const EVIL_URI_PATTERN
Pattern matching evil uris like javascript: WARNING: DO NOT use this in any place that actually requi...
Definition: Sanitizer.php:62
MediaWiki\MediaWikiServices
MediaWikiServices is the service locator for the application scope of MediaWiki.
Definition: MediaWikiServices.php:172
$wgExternalInterwikiFragmentMode
$wgExternalInterwikiFragmentMode
Which ID escaping mode should be used for external interwiki links? See documentation for $wgFragment...
Definition: DefaultSettings.php:3679
Sanitizer\escapeIdForAttribute
static escapeIdForAttribute( $id, $mode=self::ID_PRIMARY)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid HTM...
Definition: Sanitizer.php:812
Sanitizer\removeHTMLtags
static removeHTMLtags( $text, $processCallback=null, $args=[], $extratags=[], $removetags=[])
Cleans up HTML, removes dangerous tags and attributes, and removes HTML comments.
Definition: Sanitizer.php:239
Sanitizer\decodeEntity
static decodeEntity( $name)
If the named entity is defined in HTML5 return the UTF-8 encoding of that character.
Definition: Sanitizer.php:1301
Sanitizer\mergeAttributes
static mergeAttributes( $a, $b)
Merge two sets of HTML attributes.
Definition: Sanitizer.php:541
Sanitizer\validateAttributes
static validateAttributes( $attribs, $allowed)
Take an array of attribute names and values and normalize or discard illegal values.
Definition: Sanitizer.php:413
Sanitizer\normalizeSectionNameWhitespace
static normalizeSectionNameWhitespace( $section)
Normalizes whitespace in a section name, such as might be returned by Parser::stripSectionName(),...
Definition: Sanitizer.php:1105
Sanitizer\escapeIdInternal
static escapeIdInternal( $id, $mode)
Helper for escapeIdFor*() functions.
Definition: Sanitizer.php:894
Sanitizer\validateEmail
static validateEmail( $addr)
Does a string look like an e-mail address?
Definition: Sanitizer.php:1708
Sanitizer\attributesAllowedInternal
static attributesAllowedInternal( $element)
Fetch the list of acceptable attributes for a given element name.
Definition: Sanitizer.php:1317
Sanitizer\decCharReference
static decCharReference( $codepoint)
Definition: Sanitizer.php:1182
Sanitizer\safeEncodeTagAttributes
static safeEncodeTagAttributes( $assoc_array)
Build a partial tag string from an associative array of attribute names and values as returned by dec...
Definition: Sanitizer.php:1047
Sanitizer\decodeCharReferencesAndNormalize
static decodeCharReferencesAndNormalize( $text)
Decode any character references, numeric or named entities, in the text and normalize the resulting s...
Definition: Sanitizer.php:1246
Sanitizer\$attribsRegex
static $attribsRegex
Lazy-initialised attributes regex, see getAttribsRegex()
Definition: Sanitizer.php:92
$wgFragmentMode
$wgFragmentMode
How should section IDs be encoded? This array can contain 1 or 2 elements, each of them can be one of...
Definition: DefaultSettings.php:3669
Sanitizer\escapeClass
static escapeClass( $class)
Given a value, escape it so that it can be used as a CSS class and return it.
Definition: Sanitizer.php:973
Sanitizer\normalizeCharReferencesCallback
static normalizeCharReferencesCallback( $matches)
Definition: Sanitizer.php:1135
Sanitizer\armorFrenchSpaces
static armorFrenchSpaces( $text, $space='&#160;')
Armor French spaces with a replacement character.
Definition: Sanitizer.php:746
RemexStripTagHandler
Definition: RemexStripTagHandler.php:9
Sanitizer\validateTag
static validateTag( $params, $element)
Takes attribute names and values for a tag and the tag name and validates that the tag is allowed to ...
Definition: Sanitizer.php:354
Sanitizer\$attribNameRegex
static $attribNameRegex
Lazy-initialised attribute name regex, see getAttribNameRegex()
Definition: Sanitizer.php:123
Sanitizer\hackDocType
static hackDocType()
Hack up a private DOCTYPE with HTML's standard entity declarations.
Definition: Sanitizer.php:1598
Sanitizer\XMLNS_ATTRIBUTE_PATTERN
const XMLNS_ATTRIBUTE_PATTERN
Definition: Sanitizer.php:63
$wgAllowImageTag
$wgAllowImageTag
A different approach to the above: simply allow the "<img>" tag to be used.
Definition: DefaultSettings.php:4731
wfDeprecatedMsg
wfDeprecatedMsg( $msg, $version=false, $component=false, $callerOffset=2)
Log a deprecation warning with arbitrary message text.
Definition: GlobalFunctions.php:1066
MWException
MediaWiki exception.
Definition: MWException.php:29
wfDeprecated
wfDeprecated( $function, $version=false, $component=false, $callerOffset=2)
Logs a warning that $function is deprecated.
Definition: GlobalFunctions.php:1034
Sanitizer\safeEncodeAttribute
static safeEncodeAttribute( $text)
Encode an attribute value for HTML tags, with extra armoring against further wiki processing.
Definition: Sanitizer.php:766
$matches
$matches
Definition: NoLocalSettings.php:24
Sanitizer\encodeAttribute
static encodeAttribute( $text)
Encode an attribute value for HTML output.
Definition: Sanitizer.php:723
$args
if( $line===false) $args
Definition: mcc.php:124
Sanitizer\escapeIdReferenceListInternal
static escapeIdReferenceListInternal( $referenceString)
Given a string containing a space delimited list of ids, escape each id to match ids escaped by the e...
Definition: Sanitizer.php:946
Sanitizer\hexCharReference
static hexCharReference( $codepoint)
Definition: Sanitizer.php:1195
Sanitizer\validateCodepoint
static validateCodepoint( $codepoint)
Returns true if a given Unicode codepoint is a valid character in both HTML5 and XML.
Definition: Sanitizer.php:1210
wfUrlProtocols
wfUrlProtocols( $includeProtocolRelative=true)
Returns a regular expression of url protocols.
Definition: GlobalFunctions.php:723
Sanitizer\MW_ENTITY_ALIASES
const MW_ENTITY_ALIASES
Character entity aliases accepted by MediaWiki in wikitext.
Definition: Sanitizer.php:84
Sanitizer\validateTagAttributes
static validateTagAttributes( $attribs, $element)
Take an array of attribute names and values and normalize or discard illegal values for the given ele...
Definition: Sanitizer.php:390
Sanitizer\ELEMENT_BITS_REGEX
const ELEMENT_BITS_REGEX
Acceptable tag name charset from HTML5 parsing spec https://www.w3.org/TR/html5/syntax....
Definition: Sanitizer.php:51
Hooks\runner
static runner()
Get a HookRunner instance for calling hooks using the new interfaces.
Definition: Hooks.php:172
Sanitizer\escapeIdForExternalInterwiki
static escapeIdForExternalInterwiki( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
Definition: Sanitizer.php:862
Sanitizer\isReservedDataAttribute
static isReservedDataAttribute( $attr)
Given an attribute name, checks whether it is a reserved data attribute (such as data-mw-foo) which i...
Definition: Sanitizer.php:520
Sanitizer\cleanUrl
static cleanUrl( $url)
Definition: Sanitizer.php:1622
Sanitizer\setupAttributesAllowedInternal
static setupAttributesAllowedInternal()
For each array key (an allowed HTML element), return an array of allowed attributes.
Definition: Sanitizer.php:1329
Sanitizer\cssDecodeCallback
static cssDecodeCallback( $matches)
Definition: Sanitizer.php:661
Sanitizer\getRecognizedTagData
static getRecognizedTagData( $extratags=[], $removetags=[])
Return the various lists of recognized tags.
Definition: Sanitizer.php:144
Sanitizer\fixTagAttributes
static fixTagAttributes( $text, $element, $sorted=false)
Take a tag soup fragment listing an HTML element's attributes and normalize it to well-formed XML,...
Definition: Sanitizer.php:703
Sanitizer\escapeIdInternalUrl
static escapeIdInternalUrl( $id, $mode)
Do percent encoding of percent signs for href (but not id) attributes.
Definition: Sanitizer.php:879
Sanitizer\normalizeCss
static normalizeCss( $value)
Normalize CSS into a format we can easily search for hostile input.
Definition: Sanitizer.php:562
Sanitizer\ID_PRIMARY
const ID_PRIMARY
Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding.
Definition: Sanitizer.php:70
Sanitizer\CHAR_REFS_REGEX
const CHAR_REFS_REGEX
Regular expression to match various types of character references in Sanitizer::normalizeCharReferenc...
Definition: Sanitizer.php:41
Sanitizer\normalizeWhitespace
static normalizeWhitespace( $text)
Definition: Sanitizer.php:1090
Sanitizer\decodeChar
static decodeChar( $codepoint)
Return UTF-8 string for a codepoint if that is a valid character reference, otherwise U+FFFD REPLACEM...
Definition: Sanitizer.php:1285
Sanitizer\getAttribNameRegex
static getAttribNameRegex()
Used in Sanitizer::decodeTagAttributes to filter attributes.
Definition: Sanitizer.php:129
StringUtils\delimiterReplace
static delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags='')
Perform an operation equivalent to preg_replace() with flags.
Definition: StringUtils.php:248
Sanitizer\normalizeCharReferences
static normalizeCharReferences( $text)
Ensure that any entities and character references are legal for XML and XHTML specifically.
Definition: Sanitizer.php:1124
Sanitizer\escapeIdForLink
static escapeIdForLink( $id)
Given a section name or other user-generated or otherwise unsafe string, escapes it to be a valid URL...
Definition: Sanitizer.php:839
Sanitizer\decodeTagAttributes
static decodeTagAttributes( $text)
Return an associative array of attribute names and values from a partial tag string.
Definition: Sanitizer.php:1005
$t
$t
Definition: testCompression.php:74
Sanitizer\decodeCharReferences
static decodeCharReferences( $text)
Decode any character references, numeric or named entities, in the text and return a UTF-8 string.
Definition: Sanitizer.php:1229
Sanitizer
HTML sanitizer for MediaWiki.
Definition: Sanitizer.php:34
Sanitizer\checkCss
static checkCss( $value)
Pick apart some CSS and check it for forbidden or unsafe structures.
Definition: Sanitizer.php:632
Sanitizer\escapeIdReferenceList
static escapeIdReferenceList( $referenceString)
Given a string containing a space delimited list of ids, escape each id to match ids escaped by the e...
Definition: Sanitizer.php:934
Sanitizer\getAttribsRegex
static getAttribsRegex()
Regular expression to match HTML/XML attribute pairs within a tag.
Definition: Sanitizer.php:100
Sanitizer\escapeHtmlAllowEntities
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Definition: Sanitizer.php:988
Sanitizer\decodeCharReferencesCallback
static decodeCharReferencesCallback( $matches)
Definition: Sanitizer.php:1266
Sanitizer\cleanUrlCallback
static cleanUrlCallback( $matches)
Definition: Sanitizer.php:1676